In [17]:
import pandas as pd

# import data
path_data_train = "/Users/jerome/Documents/level_up/ML/data/sentiment_train.csv"
train = pd.DataFrame.from_csv(path_data_train, encoding = "ISO-8859-1", index_col = None)
path_data_test = "/Users/jerome/Documents/level_up/ML/data/sentiment_test.csv"
test = pd.DataFrame.from_csv(path_data_test, encoding = "ISO-8859-1", index_col = None)

SEED = 42

In [18]:
# extract data

texts_train = train['tweet'].fillna("").values
labels_train = train['sentiment'].values

texts_test = test['tweet'].fillna("").values
labels_test = test['sentiment'].values

from nltk.stem import SnowballStemmer
from nltk import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

stemmer = SnowballStemmer("english")

# preprocess a word
# either a numeric, a username, or a stemmed word
def preprocessingWord(word):
    if re.search('^\d+$', word):
        return "_numeric_"
    elif re.search("^@", word):
        return "_username_"
    else:
        return stemmer.stem(word)

tokenizer = RegexpTokenizer('@?\w+', flags = re.UNICODE, gaps = False)

# preprocessing done on the sentence when use in tf-idf
def preprocessingTweet(tweet):
    tweet = re.sub("https?://[^ ]+", "", tweet) # remove url
    return " ".join([preprocessingWord(word) for word in tokenizer.tokenize(tweet)])

# we keep only the words with more than 5 occurences
tfidf = TfidfVectorizer(min_df=5,
                        preprocessor=preprocessingTweet)

X_train = tfidf.fit_transform(texts_train)
print(X_train.shape)
X_test = tfidf.transform(texts_test)
print(X_test.shape)

(22633, 3614)
(4748, 3614)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

logistic = LogisticRegression(random_state=SEED)
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

parameters = {'C': [0.01, 0.1, 1, 10]}
scoring = "accuracy"

cross = GridSearchCV(logistic, parameters, 
                     scoring=scoring,
                     cv=kfold, 
                     verbose=2
                    )

cross.fit(X_train, labels_train)
print("\nthe best score is of {} for the follwing parameters {}\n".format(cross.best_score_, cross.best_params_))

y_pred = cross.predict(X_test)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print(classification_report(labels_test, y_pred))
print(confusion_matrix(labels_test, y_pred))
print("\nthe accuracy on the test set is of {}".format(accuracy_score(labels_test, y_pred)))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   0.2s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ........................................... C=0.01, total=   0.1s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   0.1s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   0.2s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   0.2s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   0.2s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=   0.4s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=   0.4s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    4.2s finished



the best score is of 0.7481553483851014 for the follwing parameters {'C': 1}

             precision    recall  f1-score   support

   negative       0.80      0.84      0.82      1643
    neutral       0.72      0.80      0.76      2063
   positive       0.71      0.49      0.58      1042

avg / total       0.75      0.75      0.74      4748

[[1382  213   48]
 [ 244 1659  160]
 [ 109  421  512]]

the accuracy on the test set is of 0.7483150800336984


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

count = CountVectorizer(
            min_df=5,
            preprocessor=lambda tweet: preprocessingTweet(tweet))

X_train_count = count.fit_transform(texts_train)
X_test_count = count.transform(texts_test)

multinomial = MultinomialNB()

parameters = {'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1]}

cross = GridSearchCV(multinomial, parameters, 
                     scoring=scoring,
                     cv=kfold, 
                     verbose=2
                    )

cross.fit(X_train_count, labels_train)

print("\nthe best score is of {} for the following parameters {}\n".format(cross.best_score_, cross.best_params_))

y_pred = cross.predict(X_test_count)

print(classification_report(labels_test, y_pred))
print(confusion_matrix(labels_test, y_pred))
print("\nThe accuracy on the test set is of {}".format(accuracy_score(labels_test, y_pred)))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] alpha=0 .........................................................
[CV] .......................................... alpha=0, total=   0.1s
[CV] alpha=0 .........................................................
[CV] .......................................... alpha=0, total=   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .......................................... alpha=0, total=   0.1s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   0.1s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   0.1s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   0.1s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   0.1s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   0.1s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   0.1s
[CV] alpha=0.6 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.6s finished


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

parameters = {
    'vect__min_df': (1, 5, 10),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf__C': (0.1, 1, 10)
}

grid_search = GridSearchCV(pipeline, parameters, verbose=2, cv=kfold, n_jobs=-1)
grid_search.fit(texts_train,labels_train)

print("\nthe best score is of {} for the following parameters {}\n".format(grid_search.best_score_, grid_search.best_params_))

y_pred = grid_search.predict(texts_test)

print(classification_report(labels_test, y_pred))
print(confusion_matrix(labels_test, y_pred))
print("\nThe accuracy on the test set is of {}".format(accuracy_score(labels_test, y_pred)))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1 ............
[CV] vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1 ............
[CV] vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1 ............
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=0.1 ............
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1, total=   1.9s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=0.1 ............
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1, total=   1.9s
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=0.1, total=   1.9s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=0.1 ............
[CV] vect__ngram_range=(1, 1), vect__min_df=5, clf__C=0.1 ............
[CV]  vect__ngram_range=(1, 1), vect__min_df=5, clf__C=0.1, total=   1.5s
[CV] vect__ngram_range=(1, 1), vect__min_df=5, clf__C=0.1 ............
[CV]  vect__ngram_range=(1, 2), vect__min_df=1, clf__C=0.1, total=   5.1s
[

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.2s


[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=10, total=   2.6s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, clf__C=10 .............
[CV]  vect__ngram_range=(1, 2), vect__min_df=10, clf__C=1, total=   4.4s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, clf__C=10 .............
[CV]  vect__ngram_range=(1, 2), vect__min_df=10, clf__C=1, total=   4.5s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=10 .............
[CV]  vect__ngram_range=(1, 2), vect__min_df=10, clf__C=1, total=   4.5s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=10 .............
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=10, total=   2.8s
[CV] vect__ngram_range=(1, 2), vect__min_df=1, clf__C=10 .............
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, clf__C=10, total=   2.8s
[CV] vect__ngram_range=(1, 1), vect__min_df=5, clf__C=10 .............
[CV]  vect__ngram_range=(1, 1), vect__min_df=5, clf__C=10, total=   2.5s
[CV] vect__ngram_range=(1, 1), vect__min_df=5, clf__C=10 ......

[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.2min finished



the best score is of 0.7598197322493704 for the following parameters {'vect__ngram_range': (1, 2), 'vect__min_df': 1, 'clf__C': 10}

             precision    recall  f1-score   support

   negative       0.80      0.87      0.83      1643
    neutral       0.77      0.76      0.76      2063
   positive       0.69      0.61      0.65      1042

avg / total       0.76      0.76      0.76      4748

[[1422  174   47]
 [ 257 1566  240]
 [  99  304  639]]

The accuracy on the test set is of 0.7639005897219882
