In [42]:
import numpy as np
import pandas as pd
import tarfile
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Load the labelled tweet corpus (decoded as Latin-1) and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="Sentiment.csv",
    encoding="latin-1",
)
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [11]:
# Keep only the tweet text and its label, in that column order.
data = data.loc[:, ['SentimentText', 'Sentiment']]
data.head()

Unnamed: 0,SentimentText,Sentiment
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0


In [12]:
# Show the full, untruncated value at row position 1, column position 0.
data.iloc[1,0]

'                   I missed the New Moon trailer...'

In [13]:
# Report the frame's index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99989 entries, 0 to 99988
Data columns (total 2 columns):
SentimentText    99989 non-null object
Sentiment        99989 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [14]:
# Summary statistics for the numeric columns of the frame.
data.describe()

Unnamed: 0,Sentiment
count,99989.0
mean,0.564632
std,0.495808
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [16]:
# Class balance: how many tweets carry each sentiment label.
data['Sentiment'].value_counts()

1    56457
0    43532
Name: Sentiment, dtype: int64

In [21]:
# Accumulator for (cleaned text, sentiment) pairs built by the cleaning cell.
tweets = []

# stopwords.words() raises LookupError when the NLTK "stopwords" corpus has not
# been downloaded yet; fetch it on demand and retry so that a fresh environment
# can still run this cell. A set is used for fast membership checks.
try:
    stopwords_set = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopwords_set = set(stopwords.words("english"))
stopwords_set

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [29]:
def _clean_tweet(text, stop_words):
    """Return `text` reduced to its informative, lower-cased tokens.

    The text is split on whitespace. A token is dropped when it is shorter
    than three characters, contains 'http', starts with '#' or '@', or is
    listed in `stop_words`. Surviving tokens are joined with single spaces.
    """
    # The length check is applied to each whitespace-delimited token before
    # it is lower-cased.
    tokens = [tok.lower() for tok in text.split() if len(tok) >= 3]
    kept = [
        tok for tok in tokens
        if 'http' not in tok
        and not tok.startswith(('#', '@'))
        and tok not in stop_words
    ]
    return ' '.join(kept)


# Build the list from scratch on every execution: appending to a list created
# in another cell meant that re-running this cell duplicated every tweet.
# Iterating the two columns directly also avoids the per-row Series that
# DataFrame.iterrows() constructs.
tweets = [
    (_clean_tweet(text, stopwords_set), sentiment)
    for text, sentiment in zip(data.SentimentText, data.Sentiment)
]

In [46]:
# Turn the (cleaned text, label) pairs into a two-column frame and preview it.
tweets_dataframe = pd.DataFrame(data=tweets, columns=['Text', 'Sentiment'])
tweets_dataframe.head()

Unnamed: 0,Text,Sentiment
0,sad apl friend.............,0
1,missed new moon trailer...,0
2,omg already 7:30,1
3,omgaga. sooo gunna cry. i've dentist since 11....,0
4,think cheating me!!! t_t,0


In [47]:
# Bag-of-words features: a sparse document-term matrix of token counts.
# NOTE(review): the vocabulary is learned from every tweet before the
# train/test split is made, so held-out tweets shape the feature space --
# consider fitting on the training portion only.
cv = CountVectorizer()
X = cv.fit_transform(tweets_dataframe['Text'])
X.shape

(99989, 51886)

In [39]:
# Encode the sentiment labels as integer class ids for the classifier.
le = LabelEncoder()
Y = le.fit_transform(tweets_dataframe['Sentiment'])
Y

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [40]:
# Hold out 20% of the tweets for evaluation. The seed is fixed so that the
# split -- and every score computed from it -- is the same on each run;
# without it the rows assigned to each side change on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(79991, 51886)
(79991,)
(19998, 51886)
(19998,)


In [41]:
# Fit a logistic-regression classifier with default settings on the training split.
clf = LogisticRegression()
clf.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
# Accuracy on the rows the model was fitted on.
y_train_pred = clf.predict(x_train)
print("Train accuracy = ", accuracy_score(y_train, y_train_pred))

# Accuracy on the held-out rows; labelled "Test" so it cannot be mistaken
# for the training score printed above.
y_test_pred = clf.predict(x_test)
print("Test accuracy = ", accuracy_score(y_test, y_test_pred))


Train accuracy =  0.8425822905076821
Train accuracy =  0.7453245324532454


In [50]:
# TF-IDF features: a sparse document-term matrix of TF-IDF weights.
# NOTE(review): the vocabulary and IDF weights are learned from every tweet
# before the train/test split is made, so held-out tweets influence the
# features -- consider fitting on the training portion only.
cv = TfidfVectorizer()
X = cv.fit_transform(tweets_dataframe['Text'])
X

<99989x51886 sparse matrix of type '<class 'numpy.float64'>'
	with 669940 stored elements in Compressed Sparse Row format>

In [51]:
# Re-encode the sentiment labels as integer class ids for the TF-IDF experiment.
le = LabelEncoder()
Y = le.fit_transform(tweets_dataframe['Sentiment'])
Y

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [52]:
# Hold out 20% of the tweets for evaluation. The seed is fixed so that the
# split -- and every score computed from it -- is the same on each run;
# without it the rows assigned to each side change on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(79991, 51886)
(79991,)
(19998, 51886)
(19998,)


In [53]:
# Fit a logistic-regression classifier with default settings on the training split.
clf = LogisticRegression()
clf.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [54]:
# Accuracy on the rows the model was fitted on.
y_train_pred = clf.predict(x_train)
print("Train accuracy = ", accuracy_score(y_train, y_train_pred))

# Accuracy on the held-out rows; labelled "Test" so it cannot be mistaken
# for the training score printed above.
y_test_pred = clf.predict(x_test)
print("Test accuracy = ", accuracy_score(y_test, y_test_pred))


Train accuracy =  0.8098911127501844
Train accuracy =  0.7474747474747475
