In [1]:
import re
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Shared NLP resources used by the text preprocessing step below:
# a WordNet lemmatizer and the English stop-word list held as a set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
# Load the raw reviews table and preview the first rows
reviews_path = '../data/Reviews.csv'
df = pd.read_csv(reviews_path)
df.head()

In [None]:
# Append the summary to the review body. Missing values are blanked first,
# because str + NaN evaluates to NaN and would discard the other half of the
# review whenever only one of the two fields is present.
df['Text'] = df['Text'].fillna('') + ' ' + df['Summary'].fillna('')

In [None]:
# Replace missing review text with an empty string. The result is assigned
# back instead of calling fillna(inplace=True) on df['Text']: that form is a
# chained in-place update on a column selection, which warns in pandas 2.x
# and leaves df unchanged under Copy-on-Write.
df['Text'] = df['Text'].fillna("")
df.isna().sum()

In [None]:
# Drop the columns that are not used further in this notebook
unused_columns = ['Id', 'ProfileName', 'Summary', 'Time', 'ProductId']
df = df.drop(columns=unused_columns)

In [None]:
# Matches one HTML tag such as <br />, </p> or <a href="...">. Requiring a
# letter right after '<' (or '</') keeps things like "<3" from being treated
# as markup, and [^<>]* stops the match at the tag's own closing '>'.
_HTML_TAG_RE = re.compile(r'</?[a-zA-Z][^<>]*>')
# Matches http: and https: links up to the next whitespace character.
_URL_RE = re.compile(r'https?:\S+', re.IGNORECASE)
# Matches every character that is not a lowercase ASCII letter or a space.
_NON_LETTER_RE = re.compile(r'[^a-z ]')


def preprocess(s):
    """Normalize one review string into space-separated lemmatized tokens.

    Steps, in order: strip HTML tags, replace URLs with the token ``url``,
    lowercase, replace everything except ``a-z`` and spaces with a space,
    drop words found in the module-level ``stop_words`` and lemmatize the
    rest with the module-level ``lemmatizer``.

    Parameters
    ----------
    s : str
        Raw review text. Any non-string value (e.g. NaN from a missing
        cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the cleaning.
    """
    if not isinstance(s, str):
        return ''

    # Tags are replaced by a space (not removed outright) so that the words
    # on either side of e.g. "<br/>" do not get glued together.
    s = _HTML_TAG_RE.sub(' ', s)
    s = _URL_RE.sub('url', s)

    s = s.lower()
    s = _NON_LETTER_RE.sub(' ', s)

    # Stop words are filtered before lemmatizing, matching on the raw token.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in s.split()
        if word not in stop_words
    )

# Clean every review text element-wise with preprocess()
df['Text'] = df['Text'].map(preprocess)

In [None]:
#df['Text'] = df['Text'] + ' ' + df['UserId']

In [None]:
# Mark reviews that became empty after preprocessing as missing and drop
# those rows from the frame. Assigning a dropna()'d Series back to the column
# realigns on the index and re-inserts NaN, so the rows have to be removed
# from df itself.
df['Text'] = df['Text'].replace('', np.nan)
df = df.dropna(subset=['Text'])
df.isna().sum()

In [None]:
# Index labels of rows whose text is an empty string (expected to be none)
empty_text = df['Text'] == ''
df[empty_text].index

In [None]:
# Keep only the reviews that received at least 5 helpfulness votes
enough_votes = df['HelpfulnessDenominator'] >= 5
df = df[enough_votes]

In [None]:
# Share of votes that marked the review as helpful
df['Helpfulness'] = df['HelpfulnessNumerator'].div(df['HelpfulnessDenominator'])
#df['Helpfulness'].fillna(0, inplace=True)

In [None]:
# Total votes minus helpful votes; a negative value means the numerator
# exceeds the denominator, which the cells below treat as an invalid row
df['HelpDiff'] = df['HelpfulnessDenominator'].sub(df['HelpfulnessNumerator'])
df.head()

In [None]:
# Rows with more helpful votes than total votes
invalid_votes = df['HelpDiff'] < 0
df[invalid_votes]

In [None]:
# Keep rows whose helpful votes do not exceed total votes, then display
# whatever still violates that (expected to be an empty frame)
valid_votes = df['HelpDiff'] >= 0
df = df[valid_votes]
df[df['HelpDiff'] < 0]

In [None]:
# Drop the vote/score columns now that the helpfulness ratio has been derived
df = df.drop(columns=[
    'UserId',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'HelpDiff',
])

In [None]:
# Preview the remaining columns
df.head(n=5)

In [None]:
# Binary label: 1 when more than half of the votes were helpful, otherwise 0
is_helpful = df['Helpfulness'] > 0.50
df['Helpful'] = np.where(is_helpful, 1, 0)
df.head()

In [None]:
# The ratio is no longer needed once the binary label exists
df = df.drop(columns=['Helpfulness'])

In [None]:
# Index labels of rows whose text is an empty string (expected to be none)
empty_text = df['Text'] == ''
df[empty_text].index

In [None]:
# Write the cleaned text and label to disk without the index column
output_path = '../data/preprocessed_helpful.csv'
df.to_csv(output_path, index=False)

In [None]:
# Distinct label values present in the final frame
pd.unique(df['Helpful'])

In [None]:
# Spot-check a single cleaned review by its position in the frame
df['Text'].iloc[63240]