In [None]:
import pandas as pd
import nltk
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

from nltk.corpus import stopwords
stop = stopwords.words('english')

def get_avg_length(review):
    """Return the mean character count of the words in ``review``.

    Words are the whitespace-separated tokens of ``review``. A review with
    no words (empty or whitespace-only) yields ``0`` rather than dividing
    by zero.
    """
    words = review.split()
    if not words:
        return 0
    return sum(map(len, words)) / len(words)


def get_review_length(review):
    """Return how many whitespace-separated words ``review`` contains."""
    return len(review.split())

def get_tweet_character_length(tweet):
    """Count the characters of ``tweet`` that are not in ``not_counted``.

    NOTE(review): ``not_counted`` is looked up as a global and is not
    defined in the visible part of this file -- confirm where it is meant
    to come from before calling this with a non-empty ``tweet``.
    """
    return sum(1 for character in tweet if character not in not_counted)

def remove_stopwords(review):
    """Return ``review`` with every word found in ``stop`` removed.

    ``stop`` is the module-level stop-word collection. Words are compared
    exactly as they appear (the check is case-sensitive), and the kept
    words are re-joined with single spaces.
    """
    return " ".join(token for token in review.split() if token not in stop)

def to_lower(review):
    """Return ``review`` lowercased, with words re-joined by single spaces."""
    return " ".join(map(str.lower, review.split()))
    

def remove_small(review):
    """Return ``review`` without words of two or fewer characters.

    The surviving words are re-joined with single spaces.
    """
    long_enough = []
    for token in review.split():
        if len(token) >= 3:
            long_enough.append(token)
    return " ".join(long_enough)

def remove_digits(review):
    """Return ``review`` without the words that consist solely of digits.

    Only whole-word numbers are dropped (``"42"``); words that merely
    contain digits (``"abc123"``) are kept. The surviving words are
    re-joined with single spaces.
    """
    # Fixed: the original split the undefined name ``reviews`` instead of
    # the ``review`` parameter, so every call raised NameError.
    return " ".join(word for word in review.split() if not word.isdigit())
    

# Work from the folder that holds the downloaded dataset.
# NOTE(review): machine-specific absolute path -- point it at wherever
# amazon_alexa.tsv lives on your system.
os.chdir("/Users/emmagoldberg/Downloads")
# read in our data
amazon_alexa = pd.read_csv("amazon_alexa.tsv", delimiter='\t')
# remove na rows BEFORE casting to str: the cast can turn a missing review
# into the literal text "nan", which dropna() would then no longer catch
amazon_alexa = amazon_alexa.dropna()
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].astype("str")
amazon_alexa = amazon_alexa[amazon_alexa.verified_reviews != '']
# renumber the surviving rows 0..n-1 so this frame lines up row-for-row with
# any freshly built frame (default 0..n-1 index) it is combined with later
amazon_alexa = amazon_alexa.reset_index(drop=True)
# get the average rating and plot
avg_rating = amazon_alexa['rating'].mean()
rating_counts = amazon_alexa['rating'].value_counts()
#rating_counts.plot(kind = 'bar', color = 'seagreen')
#print("We can see that most reviews are 4 and 5 stars.")

# remove all but the reviews and rating column, as we are predicting ratings
# solely on text reviews
cols_to_remove = ['date', 'variation', 'feedback']
amazon_alexa = amazon_alexa.drop(columns=cols_to_remove)
# cleaning up our text reviews
# remove punctuation; regex=True is passed explicitly because recent pandas
# treats the pattern as a literal string by default, and the raw string keeps
# \w and \s from being read as (invalid) string escapes
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].str.replace(
    r'[^\w\s]', '', regex=True)
# change reviews to lowercase first: remove_stopwords compares words exactly,
# so capitalised stop words would otherwise slip through
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].apply(to_lower)
# remove stopwords
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].apply(remove_stopwords)
# remove words that are two or fewer characters
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].apply(remove_small)
# remove most common words: count every word over the whole corpus and drop
# the ten most frequent ones from each review
reviews_string = ' '.join(amazon_alexa['verified_reviews'])
high_frequency = pd.Series(reviews_string.split()).value_counts().head(10)
#high_frequency.plot(kind = 'barh')
frequent_words = high_frequency.index
frequent_word_set = set(frequent_words)
amazon_alexa['verified_reviews'] = amazon_alexa['verified_reviews'].apply(
    lambda review: " ".join(
        word for word in review.split() if word not in frequent_word_set))

# Summary statistics and histograms of the cleaned reviews:
# words per review, and mean word length per review.
lengths = amazon_alexa['verified_reviews'].apply(get_review_length)
avg_lengths = amazon_alexa['verified_reviews'].apply(get_avg_length)

summary = pd.DataFrame(
    {
        'avg_rating': avg_rating,
        'avg_review_length': lengths.mean(),
        'avg_word_length': avg_lengths.mean(),
    },
    index=[0],
)
print(summary.head())
import matplotlib.pyplot as plt


def _show_histogram(values, bins, color, x_max, x_label):
    """Draw one histogram of ``values`` in its own figure and display it."""
    plt.figure()
    plt.hist(values, bins=bins, color=color)
    plt.xlim(0, x_max)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.show()


_show_histogram(lengths, 10, 'lightskyblue', 130, "Review Lengths")
_show_histogram(avg_lengths, 20, 'blue', 20, "Average Word Lengths Per Review")

print("We can see most reviews are between 0 and 25 words, after cleanup.")
print("We can also see that it is most common for the average word length in a review to be 3-8.")
# create the term-frequency matrix: one column per vocabulary word, with the
# vocabulary capped at 300 terms by max_features
my_vec = CountVectorizer(max_features=300)
X = my_vec.fit_transform(amazon_alexa['verified_reviews'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases
if hasattr(my_vec, "get_feature_names_out"):
    feature_names = my_vec.get_feature_names_out()
else:
    feature_names = my_vec.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=feature_names)
# assign ratings by position (.values), not by index label: df has a fresh
# 0..n-1 index, and a label-aligned assignment would mismatch or blank out
# ratings whenever rows were filtered out of amazon_alexa earlier
df['rating'] = amazon_alexa['rating'].values
amazon_alexa = df
X = amazon_alexa.drop(columns=['rating'])
y = amazon_alexa['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y)
# get our model
rf = RandomForestClassifier()
# search grid: 10 forest sizes x 15 split thresholds = 150 candidate settings
parameters = {
    'n_estimators': list(range(100, 1100, 100)),
    'min_samples_split': list(range(50, 200, 10)),
}
grid_search = GridSearchCV(rf, parameters)
grid_search.fit(X_train, y_train)
params = grid_search.best_params_

# refit with best parameters
rf = RandomForestClassifier(**params)
rf.fit(X_train, y_train)
print(params)
print("Our training score is:", rf.score(X_train, y_train))
print("Our test set score is", rf.score(X_test, y_test))

# print out our classification report
predictions = rf.predict(X_test)
class_report = classification_report(y_test, predictions)
print("\n", class_report)











