# First train to analisys problem and baseline

## Reading tsv

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv("train.tsv",sep="\t", index_col=0)

## Showing data header

In [2]:
display(data.head(2))
display(data.Sentiment.value_counts())

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2


2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

## Removing Stop words from data

In [3]:
import nltk
#nltk.download()
from nltk.corpus import stopwords

In [4]:
stop = stopwords.words('english')
data['Phrase'] = data.Phrase.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
display(data.head(2))

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series escapades demonstrating adage good go...,1
2,1,A series escapades demonstrating adage good goose,2


## To lowercase data

In [5]:
data['Phrase'] = data.Phrase.apply(lambda x: x.lower())
display(data.head(2))

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,a series escapades demonstrating adage good go...,1
2,1,a series escapades demonstrating adage good goose,2


## Stratifying Data

In [6]:
X = data.Phrase
y = data.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
display(X_train.head(2))
display(y_train.value_counts())
display(X_test.head(2))
display(y_test.value_counts())

PhraseId
35877                                     hairdo
56149    fun , celeb-strewn backdrop well used .
Name: Phrase, dtype: object

2    63665
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64

PhraseId
7964    passionate , somewhat flawed ,
5882                       thoughtless
Name: Phrase, dtype: object

2    15917
3     6585
1     5455
4     1841
0     1414
Name: Sentiment, dtype: int64

## Transforming data

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.fit_transform(X_test)
X_train = train_tfidf.toarray()
X_test = test_tfidf.toarray()
print(X_train.shape)

(124848, 5000)


## Training Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
Sentiment_model = RandomForestClassifier(n_estimators=25)
Sentiment_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Evaluating Model

In [10]:
from sklearn.metrics import classification_report

predictions = Sentiment_model.predict(X_test)
probabilities = Sentiment_model.predict_proba(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.14      0.05      0.07      1414
           1       0.22      0.15      0.18      5455
           2       0.56      0.73      0.63     15917
           3       0.27      0.21      0.24      6585
           4       0.09      0.05      0.06      1841

   micro avg       0.45      0.45      0.45     31212
   macro avg       0.26      0.24      0.24     31212
weighted avg       0.39      0.45      0.41     31212



## Saving Model

In [14]:
import pickle
pickle.dump( Sentiment_model, open("Sentiment_model_baseline.pkl", "wb" ))
pickle.dump( tfidf, open("tfidf_baseline.pkl", "wb" ))