In [97]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/bagofwords-word2vec/labeledTrainData.tsv
/kaggle/input/bagofwords-word2vec/testData.tsv
/kaggle/input/bagofwords-word2vec/unlabeledTrainData.tsv
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip


In [98]:

os.listdir("/kaggle/input/")
df = pd.read_csv('/kaggle/input/bagofwords-word2vec/labeledTrainData.tsv', header = 0,  delimiter= "\t", quoting = 3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [99]:
print(df.isnull().sum())


id           0
sentiment    0
review       0
dtype: int64


In [100]:
method = "tfidf"
porterStemming=True
lemmatize=True

In [101]:
processed_reviews = []
stopSet = set(stopwords.words("english"))

if porterStemming :
    porter = nltk.PorterStemmer()

if lemmatize :
    WNlemma = nltk.WordNetLemmatizer()

for i in range(df.review.shape[0]):
    soup = BeautifulSoup(df.review[i])
    text = soup.find_all('p')[0].get_text()
    only_AZ = re.sub(r"[^a-zA-Z]"," ",text)
    words_az=only_AZ.lower().split()
    words = [w for w in words_az if not w in stopSet]
    if porterStemming : 
        w=[porter.stem(t) for t in words]
        words = w
    if lemmatize :
        w=[WNlemma.lemmatize(t) for t in words]
        words=w
    processed_reviews.append(" ".join(words))
df["processed"] = processed_reviews

In [102]:
df.head()

Unnamed: 0,id,sentiment,review,processed
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff go moment mj start listen music watch od...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war world timothi hine entertain film ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film start manag nichola bell give welcom inve...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assum prais film greatest film opera ever...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbl trashi wondrous unpretenti exploit hoo...


In [103]:
if method == "CountVectorizer" : 
    vect = CountVectorizer(max_features = 3000)
    X = vect.fit_transform(df["processed"])
    X=X.toarray()
elif method == "tfidf" :
    vect = TfidfVectorizer(max_features=9000, min_df=2).fit(df["processed"])
    X = vect.transform(df["processed"])

In [104]:
from sklearn.model_selection import train_test_split
#x_train,x_test,y_train,y_test=train_test_split(X,df["sentiment"],test_size=0.2,random_state=12)

In [105]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators = 100) 

# This may take a few minutes to run
#RF = RF.fit( x_train, y_train )

In [106]:
#pred = RF.predict(x_test)
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
#accuracy=accuracy_score(y_test,pred)
#roc_auc=roc_auc_score(y_test,pred)
print("accuracy :",accuracy)
print("roc_auc :",roc_auc)

accuracy : 0.843
roc_auc : 0.8431859159546785


In [107]:
RF = RF.fit( X,df["sentiment"] )

test = pd.read_csv("/kaggle/input/bagofwords-word2vec/testData.tsv", header=0, delimiter="\t",quoting=3 )
test_processed_reviews = []
stopSet = set(stopwords.words("english"))

if porterStemming :
    porter = nltk.PorterStemmer()

for i in range(test.review.shape[0]):
    soup = BeautifulSoup(test.review[i])
    text = soup.find_all('p')[0].get_text()
    only_AZ = re.sub(r"[^a-zA-Z]"," ",text)
    words_az=only_AZ.lower().split()
    words = [w for w in words_az if not w in stopSet]
    if porterStemming : 
        w=[porter.stem(t) for t in words]
        words = w
    if lemmatize :
        w=[WNlemma.lemmatize(t) for t in words]
        words=w
    test_processed_reviews.append(" ".join(words))
test["processed"] = test_processed_reviews
Xtest = vect.transform(test["processed"])
Xtest=Xtest.toarray()


In [108]:
prediction = RF.predict(Xtest)
output = pd.DataFrame( data={"id":test["id"], "sentiment":prediction} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_"+method+".csv", index=False, quoting=3 )