### Preprocessing

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including any diagnostics scikit-learn would emit further down.
warnings.filterwarnings("ignore")

# Read the cleaned (not yet vectorized) dataset and discard rows with no text,
# since the vectorizer below needs a string for every document.
df = pd.read_csv('cleaned_non-vectorized_data.csv').dropna(subset=['Text'])
df

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1
2,layoff 20 workforce 100 employee sf bay area h...,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
9277,traik01 cdc people warmed u 2 year ago .... sa...,"['traik01', 'cdc', 'people', 'warmed', 'u', '2...",0,0,0,0,1,0,0
9278,sorry ’ promo code share lately 😭 promos autom...,"['sorry', '’', 'promo', 'code', 'share', 'late...",0,0,0,0,1,0,0
9279,poor lad http //t.co/36o565zsc3,"['poor', 'lad', 'http', '//t.co/36o565zsc3']",0,0,0,0,1,0,0
9280,one day able bill order tmobile bill sadly tod...,"['one', 'day', 'able', 'bill', 'order', 'tmobi...",0,0,0,0,1,0,0


In [2]:
# Tunable TF-IDF settings; the values below match the scikit-learn defaults.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
ngram_range = (1, 1)
max_df = 1.0
min_df = 1
max_features = None

# Create the vectorizer from the settings above.
tfidf_settings = dict(
    ngram_range=ngram_range,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split performed in a later cell, so the vocabulary and IDF weights
# also see the rows that end up in the test set.
X = vectorizer.fit_transform(df['Text'])
X

<9279x20815 sparse matrix of type '<class 'numpy.float64'>'
	with 135513 stored elements in Compressed Sparse Row format>

In [3]:
# Target matrix: one column per emotion label, in the same row order as `df`.
emotion_columns = ['disgust', 'joy', 'anger', 'surprised', 'sad', 'fear', 'neutral']
y = df[emotion_columns]
y

Unnamed: 0,disgust,joy,anger,surprised,sad,fear,neutral
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
9277,0,0,0,0,1,0,0
9278,0,0,0,0,1,0,0
9279,0,0,0,0,1,0,0
9280,0,0,0,0,1,0,0


### Begin Machine Learning

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score

def print_classification_scores(y_test, pred, y_score=None):
    """Print accuracy, micro-averaged precision/recall/F1 and ROC AUC.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Hard (thresholded) predictions with the same shape as ``y_test``.
    y_score : array-like, optional
        Continuous scores (probabilities or decision values) with the same
        shape as ``y_test``. When given, they are used for the ROC AUC instead
        of ``pred``. When omitted, the AUC is computed from ``pred``, which
        reduces the ROC curve to a single operating point and is kept only for
        backward compatibility.

    Returns
    -------
    None
        The scores are written to standard output.
    """
    # For multilabel indicator input scikit-learn documents accuracy_score as
    # subset accuracy: a sample counts only if all of its labels match.
    print('Accuracy Score:',accuracy_score(y_test, pred))
    print('Precision Score:',precision_score(y_test, pred, average='micro'))
    print('Recall Score:',recall_score(y_test, pred, average='micro'))
    print('F1 Score:',f1_score(y_test, pred, average='micro'))
    # ROC AUC is a ranking metric; prefer continuous scores when available.
    auc_input = pred if y_score is None else y_score
    print('AUC Score:',roc_auc_score(y_test, auc_input, average='micro'))
    
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

#### Example with Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
# Candidate values for C; the `estimator__` prefix routes the parameter to the
# LogisticRegression wrapped inside the MultiOutputClassifier.
param_grid = {'estimator__C': [0.1, 1, 10]}

# One logistic regression per label column, tuned with GridSearchCV's default
# cross-validation and scoring.
per_label_model = MultiOutputClassifier(LogisticRegression())
clf = GridSearchCV(per_label_model, param_grid=param_grid)
clf.fit(X_train, y_train)
print(clf.best_estimator_)

# Score the refitted best model on the held-out rows.
pred = clf.predict(X_test)
print_classification_scores(y_test, pred)

MultiOutputClassifier(estimator=LogisticRegression(C=10))
Accuracy Score: 0.5290948275862069
Precision Score: 0.7409988385598142
Recall Score: 0.5951492537313433
F1 Score: 0.6601138127263322
AUC Score: 0.7770178422049047
