# Sentiment Analysis con scikit-learn

In [1]:
import pandas as pd

In [2]:
# Load the labelled reviews and name the two data columns "text" and "label".
# `read_excel` takes no `encoding` argument (the workbook format carries its own
# encoding); recent pandas versions reject unknown keyword arguments with a TypeError.
df = pd.read_excel('reviews_en.xlsx', names=["text", "label"])

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,text,label
0,you cannot convince me they are not paying peo...,negative
1,yes you read that header right thats how revol...,negative
2,wow talk about the most misrepresented show on...,negative
3,worst movie i have seen in ever and it goes on...,negative
4,worst hero movie ever seen bad screenplay bad ...,negative


In [4]:
# Per-class summary of the review texts (count, unique, most frequent value and its frequency).
print(df.groupby(by='label').describe())

          text                                                               
         count unique                                                top freq
label                                                                        
negative   999    960  its an okay film epic and all of course the fa...    4
positive   999    999  i think that this movie was very interesting. ...    1


In [5]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
cl = {'positive': 1, 'negative': 0}

# Guard the mapping so that re-running this cell is harmless: calling `.map` on
# labels that are already 0/1 would turn every value into NaN.
if not df['label'].isin(list(cl.values())).all():
    # Fail loudly on labels outside the mapping instead of letting `.map`
    # silently convert them to NaN.
    unknown = set(df['label'].unique()) - set(cl)
    if unknown:
        raise ValueError(
            "Unexpected label values: {}".format(sorted(unknown, key=str))
        )
    df['label'] = df['label'].map(cl)

In [6]:
# Preview the first five rows again to confirm the labels are now integer codes.
df.head(n=5)

Unnamed: 0,text,label
0,you cannot convince me they are not paying peo...,0
1,yes you read that header right thats how revol...,0
2,wow talk about the most misrepresented show on...,0
3,worst movie i have seen in ever and it goes on...,0
4,worst hero movie ever seen bad screenplay bad ...,0


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stop-word set and the stemmer once, outside the loop: rebuilding the
# set for every token makes the preprocessing needlessly slow.
# Requires the NLTK "stopwords" corpus to be available locally
# (e.g. via nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Cleaned documents, one string per review, in the same order as the rows of `df`.
corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1998), so the
# cell keeps working if the number of reviews changes.
for raw_text in df['text']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words first, then stem the remaining tokens.
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words features, with the vocabulary capped at 2000 terms.
# NOTE(review): the vocabulary is learned from all documents, including the ones
# that later end up in the test split.
cv = CountVectorizer(max_features=2000)
cv.fit(corpus)
x = cv.transform(corpus).toarray()

# Target vector taken from the label column.
cl = df['label'].values

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    cl,
    test_size=0.3,
    random_state=12345,
)

In [12]:
# Logistic Regression

In [13]:
# Import from the public package: the `sklearn.linear_model.logistic` submodule is
# private and no longer importable in recent scikit-learn releases.
from sklearn.linear_model import LogisticRegression

In [14]:
# Pin the solver explicitly instead of relying on the library default: the fit
# output recorded in this notebook shows solver='liblinear', while newer
# scikit-learn releases default to 'lbfgs'.
lr = LogisticRegression(solver='liblinear')

In [15]:
# Train the classifier on the training split.
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Predicted labels for the held-out test split.
lr_pred = lr.predict(X=x_test)

In [17]:
from sklearn.metrics import confusion_matrix, classification_report

In [18]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=lr_pred))

[[279  33]
 [ 16 272]]


In [19]:
# Per-class precision, recall, F1 score and support on the test split.
print(classification_report(y_true=y_test, y_pred=lr_pred))

             precision    recall  f1-score   support

          0       0.95      0.89      0.92       312
          1       0.89      0.94      0.92       288

avg / total       0.92      0.92      0.92       600

