In [2]:
%run ../include/util.ipynb


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline



# /Users/dduru/PythonProjects/data/sentiment-analysis-on-movie-reviews/train.tsv

df = read_csv_frame(delimiter='\t', header=0)
print(df.count())

print(df['Sentiment'].value_counts())
print((df['Sentiment'].value_counts() * 100) / df['Sentiment'].count())


X, Y = df['Phrase'], df['Sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.5)

pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='lbfgs'))
])
parameters = {
    'vect__max_df': (0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}
grid_search = GridSearchCV(
    pipeline, 
    parameters, 
    n_jobs=-1, 
    verbose=1, 
    scoring='accuracy'
)
grid_search.fit(X_train, Y_train)

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameter set: ')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name, best_parameters[param_name]))
    
predictions = grid_search.predict(X_test)
print('Accuracy: %s' % accuracy_score(Y_test, predictions))
print('Confusion matrix: ')
print(confusion_matrix(Y_test, predictions))
print('Classification Rpeort:')
print(classification_report(Y_test, predictions))




Enter path to CSV: /Users/dduru/PythonProjects/data/sentiment-analysis-on-movie-reviews/train.tsv
PhraseId      156060
SentenceId    156060
Phrase        156060
Sentiment     156060
dtype: int64
2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64
2    50.994489
3    21.098936
1    17.475971
4     5.899013
0     4.531590
Name: Sentiment, dtype: float64
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.1min finished


Best score: 0.621
Best parameter set: 
tclf__C: 10
tvect__max_df: 0.25
tvect__ngram_range: (1, 2)
tvect__use_idf: False
Accuracy: 0.6357554786620531
Confusion matrix: 
[[ 1162  1620   649    64     8]
 [  959  5929  6213   528    40]
 [  207  3092 32776  3594   154]
 [   20   389  6665  8126  1284]
 [    4    32   523  2377  1615]]
Classification Rpeort:
              precision    recall  f1-score   support

           0       0.49      0.33      0.40      3503
           1       0.54      0.43      0.48     13669
           2       0.70      0.82      0.76     39823
           3       0.55      0.49      0.52     16484
           4       0.52      0.35      0.42      4551

    accuracy                           0.64     78030
   macro avg       0.56      0.49      0.52     78030
weighted avg       0.62      0.64      0.62     78030

