In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

In [2]:
data_path = "../data/"

df = pd.read_csv(data_path + "cleaned_Tweets.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,570306133677760513,0,1.0,,,Virgin America,,cairdin,,0,virginamerica dhepburn say,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,1,570301130888122368,1,0.3486,,0.0,Virgin America,,jnardino,,0,virginamerica plus add commercial experience t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,2,570301083672813571,0,0.6837,,,Virgin America,,yvonnalynn,,0,virginamerica today must mean need take anothe...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,3,570301031407624196,-1,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,virginamerica really aggressive blast obnoxiou...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,4,570300817074462722,-1,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,virginamerica really big bad thing,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
vectorizer = TfidfVectorizer(min_df=0.0001, max_df=0.9999, ngram_range=(1,3), max_features=10000)
X = vectorizer.fit_transform(df.text)

scaler = StandardScaler(with_mean=False)
scaler.fit(X)

xTrain, xTest, yTrain, yTest = train_test_split(X, df.airline_sentiment, train_size=0.8, stratify=df.airline_sentiment)

In [4]:
def evaluate_score(yTest, yPred):
    accuracy = accuracy_score(yTest, yPred)
    precisions = precision_score(yTest, yPred, average=None)
    recalls = recall_score(yTest, yPred, average=None)
    f1s = f1_score(yTest, yPred, average=None)

    print("Accuracy:\t", accuracy)
    print("Precision:\t", precisions)
    print("Recall: \t", recalls)
    print("F1 scores:\t", f1s)

    print(f"Average\n\tPrecision: {precision_score(yTest, yPred, average='weighted')}", end='\n\t')
    print(f"Recall: {recall_score(yTest, yPred, average='weighted')}", end='\n\t')
    print(f"F1: {f1_score(yTest, yPred, average='weighted')}")

In [5]:
clf = OneVsRestClassifier(Perceptron(n_jobs=-1), n_jobs=-1)
clf.fit(scaler.transform(xTrain), yTrain)

yPred = clf.predict(xTest)

precisions = precision_score(yTest, yPred, average=None)
recalls = recall_score(yTest, yPred, average=None)
f1s = f1_score(yTest, yPred, average=None)

evaluate_score(yTest, yPred)

Accuracy:	 0.755464480874317
Precision:	 [0.8208502  0.57613169 0.66523605]
Recall: 	 [0.88392371 0.4516129  0.65539112]
F1 scores:	 [0.85122015 0.50632911 0.66027689]
Average
	Precision: 0.743892902974228
	Recall: 0.755464480874317
	F1: 0.7473442619602202
