In [None]:
# import data
import pandas as pd
import numpy as np 

# Tab-separated file with no header row, so pandas assigns integer column labels.
DATA_PATH = 'text/amazon_cells_labelled.txt'
text_data = pd.read_csv(DATA_PATH, sep='\t', header=None)

# Preview the first rows.
text_data.head()

In [None]:
#separate into data and labels
# Column 0 becomes `texts`, column 1 becomes `labels`; both are plain Python lists.
text_column, label_column = text_data[0], text_data[1]
texts = text_column.values.tolist()
labels = label_column.values.tolist()

In [None]:
#import preprocessing library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource this notebook relies on, not only the stopword list:
# word_tokenize (used in the cleaning cell) needs the Punkt models and raises
# LookupError on a fresh environment when they are missing.
# 'punkt' serves older NLTK releases, 'punkt_tab' the newer ones.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stopwords as a set for fast membership tests.
stopword = set(stopwords.words('english'))
# Tokenizer that keeps only runs of word characters, which drops punctuation.
punct_tokenizer = nltk.RegexpTokenizer(r'\w+')

In [None]:
#cleaning data
def _clean(raw_text):
    """Lowercase one text, strip punctuation, and remove English stopwords."""
    # Keep only word-character runs, then re-join so word_tokenize sees plain words.
    words_only = ' '.join(punct_tokenizer.tokenize(raw_text.lower()))
    kept = [tok for tok in word_tokenize(words_only) if tok not in stopword]
    return ' '.join(kept)

filtered_texts = [_clean(raw_text) for raw_text in texts]

# Preview the first ten cleaned texts.
filtered_texts[:10]

In [None]:
#splitting data
from sklearn.model_selection import train_test_split
# 80/20 split. The fixed seed makes the split (and every score computed from it)
# reproducible on re-run; stratify keeps the label proportions the same in the
# train and test portions.
X_train,X_test,y_train,y_test = train_test_split(
    filtered_texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
#train using pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
nb_steps = [
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

# Fit on the training split, then score on the test split.
text_clf.fit(X_train, y_train)
text_clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bag-of-words counts feeding a random forest.
# The forest is seeded so the score below is the same on every re-run; without
# random_state each fit builds different trees and the score fluctuates.
text_clf = Pipeline([
    ('vect',CountVectorizer()),
    ('clf',RandomForestClassifier(random_state=42))
])

# Fit on the training split, then score on the test split.
text_clf.fit(X_train,y_train)
text_clf.score(X_test,y_test)

In [None]:
from sklearn.svm import LinearSVC

# Bag-of-words counts feeding a linear support vector classifier.
svm_vectorizer = CountVectorizer()
svm_classifier = LinearSVC()
text_clf = Pipeline([('vect', svm_vectorizer), ('clf', svm_classifier)])

# Fit on the training split, then score on the test split.
text_clf.fit(X_train, y_train)
text_clf.score(X_test, y_test)

In [None]:
#try the prediction
sample_text = 'the device was broken when i buy it'
# predict expects an iterable of documents, so wrap the single text in a list.
sample_batch = [sample_text]
text_clf.predict(sample_batch)

In [None]:
#deploy model
import pickle

# Final model: the same vectorizer + random forest pipeline, refit on the whole
# cleaned corpus (filtered_texts) rather than only the training split.
# The forest is seeded so the persisted artifact is reproducible.
text_clf = Pipeline([
    ('vect',CountVectorizer()),
    ('clf',RandomForestClassifier(random_state=42))
])

text_clf.fit(filtered_texts,labels)

# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() call previously left the handle open.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open('sentiment.pk','wb') as model_file:
    pickle.dump(text_clf, model_file)
