In [None]:
import pandas as pd
import random

In [None]:
# Load the three tab-separated labelled-sentence files.
# header=None is required: the files are treated as having no header row
# (column names are assigned explicitly in the next cell). With the default
# header inference, pandas would consume the first record of every file as
# column names, and that record would then be silently lost on rename.
data_yelp = pd.read_table('yelp_labelled.txt', header=None)
data_amazon = pd.read_table('amazon_cells_labelled.txt', header=None)
data_imdb = pd.read_table('imdb_labelled.txt', header=None)

# Collect the frames in one list; the order here must stay aligned with the
# `company` keys used when the frames are concatenated.
combined_col = [data_amazon, data_imdb, data_yelp]

# To observe how the data in each individual dataset is structured
print(data_amazon.columns)

In [None]:
# Give every dataset the same two column headers.
# First pass: rename the columns of each frame in place.
for frame in combined_col:
    frame.columns = ["Review", "Label"]

# Second pass: echo the headers to confirm the rename took effect everywhere.
for frame in combined_col:
    print(frame.columns)

In [None]:
# Tag every row with the dataset it came from. The names below are passed as
# `keys`, so they become the outer level of the concatenated frame's index
# (an index level, not a regular column). Order matches `combined_col`.
company = ["Amazon", "imdb", "yelp"]

comb_data = pd.concat(objs=combined_col, keys=company)

In [None]:
# Explore the structure of the combined data frame:
# print its (rows, columns) shape, then display the first rows.

print(comb_data.shape)

comb_data.head()

In [None]:
# Persist the combined dataset. The target name carries no file extension;
# the index is written too (no index=False here), so the source keys are kept.
comb_data.to_csv("Sentiment_Analysis_Dataset")

# Column names of the combined frame.
print(comb_data.columns)

# Number of missing values per column.
print(comb_data.isnull().sum())

In [None]:
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
# Load the small English spaCy model.
# NOTE(review): `nlp` is not referenced by any later cell visible here.
nlp = en_core_web_sm.load()

# Materialise spaCy's stop-word collection as a list for token filtering,
# then print it for inspection.
stopwords = [*STOP_WORDS]
print(stopwords)

In [None]:
import string
# string.punctuation is a single str of ASCII punctuation characters;
# my_tokenizer tests tokens against it with `in`.
punctuations = string.punctuation
# Creating a spaCy parser
from spacy.lang.en import English
# NOTE(review): English() is instantiated directly rather than loaded from a
# trained model, so which token attributes (e.g. lemmas) are populated depends
# on the installed spaCy version — confirm.
parser = English()

In [None]:
def my_tokenizer(sentence):
    """Split ``sentence`` into normalised tokens, dropping stop words and punctuation.

    The sentence is run through the module-level ``parser``. Each token is
    replaced by its lemma (lower-cased and stripped), except tokens whose
    lemma is the placeholder ``"-PRON-"``, which keep their lower-cased
    surface form. Tokens found in ``stopwords`` or in ``punctuations`` are
    then removed.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    normalised = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            # Pronoun placeholder carries no information; keep the word itself.
            normalised.append(token.lower_)
        else:
            normalised.append(token.lemma_.lower().strip())

    # When `punctuations` is a str, `in` is a substring test, so empty tokens
    # (always a substring) are discarded here as well.
    return [
        candidate
        for candidate in normalised
        if not (candidate in stopwords or candidate in punctuations)
    ]

In [None]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Custom transformer that normalises raw text before vectorisation
class predictors(TransformerMixin):
    """Stateless pipeline step that applies ``clean_text`` to every document."""

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each ``text`` in ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so calls can be chained.

        ``y`` is optional: the ``fit_transform`` inherited from
        ``TransformerMixin`` calls ``fit(X)`` without a target when none is
        supplied, which raised ``TypeError`` while ``y`` was required.
        """
        return self

    def get_params(self, deep=True):
        """Return an empty dict: this transformer has no parameters."""
        return {}

# Minimal text normalisation used by the `predictors` transformer
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Vectorization: word-level unigram counts with English stop words removed;
# terms appearing in more than half of the documents are ignored.
vectorizer = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 1),
    max_df=0.50,
    min_df=1,
    max_features=None,
)
# Fix the solver's random seed so repeated runs fit the same model and the
# exported sentiment labels are reproducible (matches the seed used for the
# train/test split).
classifier = LinearSVC(random_state=42)

In [None]:
# TF-IDF vectorizer that tokenizes with the custom my_tokenizer.
# NOTE(review): `tfvectorizer` is not used by any later cell visible here.
tfvectorizer = TfidfVectorizer(tokenizer=my_tokenizer)

In [None]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Load the 'train' split of the TFDS 'amazon_us_reviews' dataset with file shuffling on.
# NOTE(review): `ds` is only previewed in the next cell and never feeds the
# sentiment pipeline in the visible cells — confirm it is still needed.
ds = tfds.load('amazon_us_reviews', split='train', shuffle_files=True)

In [None]:
# Preview the first 4 examples of `ds` as a DataFrame.
tfds.as_dataframe(ds.take(4))

In [None]:
# Features (review text) and labels taken from the combined frame
X, ylabels = comb_data['Review'], comb_data['Label']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the pipeline that cleans, vectorizes and classifies, using the
# CountVectorizer as the vectorization step.
pipe_countvect = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
)

# Train on the training split
pipe_countvect.fit(X_train, y_train)

# Predict labels for the held-out test split.
# NOTE(review): `sample_prediction` is never compared with `y_test` in the
# visible cells — consider reporting a metric before trusting the model.
sample_prediction = pipe_countvect.predict(X_test)

In [None]:
# Load the translated reviews and drop every row containing a missing value.
# (An alternative input, '../csv/reviews_clean.csv', was used previously.)
reviews_raw = pd.read_csv('../csv/reviews_translated.csv')
reviews = reviews_raw.dropna()

In [None]:
# Sanity check after dropna(): count the distinct row-wise missing-value
# patterns; only all-False rows should remain.
reviews.isna().value_counts()

In [None]:
# Draw up to 20 random reviews for a manual spot check of the classifier.
# The sample size is capped at the number of available reviews because
# random.sample raises ValueError when asked for more items than exist.
review_pool = list(reviews['Review'])
review_train = random.sample(review_pool, min(20, len(review_pool)))

In [None]:
# Show each sampled review followed by the label the pipeline assigns to it
for sample in review_train:
    print(sample)
    print(pipe_countvect.predict([sample]), "------", sep="\n")

In [None]:
# Predict a label for every review with the fitted pipeline.
sentiment = pipe_countvect.predict(reviews['Review'])

In [None]:
# Attach the predictions as a new 'sentiment' column (modifies `reviews` in place).
reviews['sentiment'] = sentiment

In [None]:
# Write the labelled reviews to disk, leaving the index out of the file.
reviews.to_csv('../csv/reviews_sentiment.csv', index=False)