In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/avishek-hotel/sentiments.csv


In [2]:
df=pd.read_csv("/kaggle/input/avishek-hotel/sentiments.csv",encoding='ISO-8859–1',header=0)
df.head()

Unnamed: 0,text,review
0,The premises are very peaceful and well mainta...,1
1,5 min drive out of Pisa or you need to catch t...,0
2,My family of 5 stayed at the residence San Ros...,1
3,We stayed here for a couple of nights on the w...,1
4,The accomodation was simple but more than adeq...,1


In [3]:
df.loc[ df['review'] ==1, 'review'] = 2

In [4]:
df.loc[ df['review'] ==0, 'review'] = 1

In [5]:
df.loc[ df['review'] ==-1, 'review'] = 0

In [6]:
df

Unnamed: 0,text,review
0,The premises are very peaceful and well mainta...,2
1,5 min drive out of Pisa or you need to catch t...,1
2,My family of 5 stayed at the residence San Ros...,2
3,We stayed here for a couple of nights on the w...,2
4,The accomodation was simple but more than adeq...,2
...,...,...
58607,It is hard for me to review an oceanfront hote...,1
58608,"I live close by, and needed to stay somewhere ...",2
58609,Rolled in 11:30 laid out heads down woke up to...,2
58610,Absolutely terrible..I was told I was being gi...,0


In [7]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
df["text_clean"] = df["text"].apply(lambda x: clean_text(x))

In [8]:

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["text_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec_df = df["text_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
df = pd.concat([df, doc2vec_df], axis=1)

In [9]:
# add number of characters column
df["nb_chars"] = df["text"].apply(lambda x: len(x))

# add number of words column
df["nb_words"] = df["text"].apply(lambda x: len(x.split(" ")))

In [10]:
# add tf-idfs columns
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(df["text_clean"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = df.index
df = pd.concat([df, tfidf_df], axis=1)

In [11]:
df

Unnamed: 0,text,review,text_clean,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,nb_chars,nb_words,...,word_yunker,word_yup,word_zealand,word_zen,word_zero,word_zip,word_zone,word_zoo,word_ztl,word_ÿthe
0,The premises are very peaceful and well mainta...,2,premise peaceful well maintain apartment spaci...,0.019709,0.010353,-0.178592,0.031814,0.049317,576,97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5 min drive out of Pisa or you need to catch t...,1,min drive pisa need catch bus far walk.beautif...,0.036190,-0.164353,-0.181781,0.158485,0.152218,208,35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,My family of 5 stayed at the residence San Ros...,2,family stay residence san rossore night great ...,0.089884,0.013429,0.207337,-0.239732,0.150019,753,138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,We stayed here for a couple of nights on the w...,2,stayed couple night way tuscany amp need somew...,-0.040634,0.068628,0.200889,-0.090924,0.109411,594,110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The accomodation was simple but more than adeq...,2,accomodation simple adequate spotlessly clean ...,0.001089,-0.010359,-0.147511,0.130542,0.067796,337,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58607,It is hard for me to review an oceanfront hote...,1,hard review oceanfront hotel go ocean necessar...,0.007996,0.112804,-0.006084,0.136191,-0.068845,339,61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58608,"I live close by, and needed to stay somewhere ...",2,live close need stay somewhere night due renov...,-0.053087,0.104279,-0.121130,0.044849,-0.040387,315,61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58609,Rolled in 11:30 laid out heads down woke up to...,2,roll laid head wake continental breakfast roll...,-0.099000,0.048321,0.013396,0.002030,0.156232,239,44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58610,Absolutely terrible..I was told I was being gi...,0,absolutely terrible..i tell give non smoke roo...,-0.289175,-0.179853,-0.236936,0.123757,0.147060,338,61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
label = "review"
ignore_cols = [label, "text", "text_clean"]
features = [c for c in df.columns if c not in ignore_cols]

In [13]:
# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[features], df[review], test_size = 0.20, random_state = 42)

NameError: name 'review' is not defined

In [14]:
# train a random forest classifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

# show feature importance
feature_importances_df = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

NameError: name 'X_train' is not defined