In [9]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

In [10]:
# Read data
dfTrain = pd.read_csv('text_emotion_data.csv')
dfTest = pd.read_csv('text_emotion_validate.csv')

# Catagorize emotions
dfTrain['sentiment'] = dfTrain.sentiment.astype('category').values.codes
dfTest['sentiment'] = dfTest.sentiment.astype('category').values.codes

In [11]:
# Split data in X and Y
allTweetsTrain = dfTrain['content']
allTweetsTest = dfTest['content']

allEmotionsTrain = dfTrain['sentiment']
allEmotionsTest = dfTest['sentiment']

In [12]:
# Vectorize the sentences in the tweets
vectorizeTweets = CountVectorizer(stop_words='english', ngram_range=(1,4))
trainCounts = vectorizeTweets.fit_transform(allTweetsTrain)
testCounts = vectorizeTweets.transform(allTweetsTest)

# Print shape of sets to check if features match
print(trainCounts.shape)
print(testCounts.shape)

(36000, 549011)
(4000, 549011)


In [13]:
# Tfidf-scale the tweets
tfidf_transformer = TfidfTransformer()
trainTweetstfidf = tfidf_transformer.fit_transform(trainCounts)
testTweetsifidf = tfidf_transformer.fit_transform(testCounts)

# print shape of sets to check if features match
print(trainTweetstfidf.shape)
print(testTweetsifidf.shape)

(36000, 549011)
(4000, 549011)


In [14]:
# Fit a model
clf = SGDClassifier().fit(trainTweetstfidf, allEmotionsTrain)

In [15]:
# Predict the test set
prediction = clf.predict(testTweetsifidf)

In [16]:
# print accuracy
accuracy_score(prediction,allEmotionsTest)

0.33625