# Train model

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('data\Reddit_Data.csv')
data = data.dropna()

In [3]:
data.head()

Unnamed: 0,text,sentiment
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [4]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])
y = data['sentiment']

In [5]:
# print x
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
from sklearn.metrics import classification_report

In [11]:
print('Accuracy: \n', accuracy_score(y_test, model.predict(X_test)))
print('\nConfusion matrix: \n', confusion_matrix(y_test, model.predict(X_test)))
print('\nClassification report: \n', classification_report(y_test, model.predict(X_test)))

Accuracy: 
 0.8421265141318978

Confusion matrix: 
 [[1136  176  285]
 [  42 2504  108]
 [ 235  327 2617]]

Classification report: 
               precision    recall  f1-score   support

          -1       0.80      0.71      0.75      1597
           0       0.83      0.94      0.88      2654
           1       0.87      0.82      0.85      3179

    accuracy                           0.84      7430
   macro avg       0.84      0.83      0.83      7430
weighted avg       0.84      0.84      0.84      7430



In [54]:
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump((model, vectorizer), model_file) 

In [55]:
model = pickle.load(open('sentiment_model.pkl', 'rb'))

# Test model

In [56]:
class SentimentClassifier:
    model_prediction  = {
        -1 : 'Negative',
        0 : 'Neutral',
        1 : 'Positive'
    }
    
    def __init__(self, model_file):
        with open(model_file, 'rb') as model_file:
            self.model, self.vectorizer = pickle.load(model_file)

    def predict(self, text):
        X = self.vectorizer.transform([text])
        y = self.model.predict(X)
        return SentimentClassifier.model_prediction[y[0]]

In [57]:
obj = SentimentClassifier('sentiment_model.pkl')

In [58]:
obj.predict('I love this product') 

'Positive'