In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy_langdetect import LanguageDetector
%matplotlib inline

sns.set_style('whitegrid')

In [2]:
# Load the raw review export; the 'at' column is parsed as datetimes on read.
df = pd.read_csv(
    'Reviews.csv',
    parse_dates=['at'],
)

In [3]:
# Per-column overview: dtype, number of missing values, number of distinct values.
desc = [
    [
        column,
        df[column].dtypes,
        df[column].isna().sum(),
        df[column].nunique(),
    ]
    for column in df.columns
]

# Bare last expression so the notebook renders the summary table.
pd.DataFrame(
    desc,
    columns=['Features Name', 'Types', 'Total NaN', 'Total Unique Data'],
)

Unnamed: 0,Features Name,Types,Total NaN,Total Unique Data
0,Unnamed: 0,int64,0,34888
1,reviewId,object,0,24269
2,userName,object,2,24027
3,userImage,object,0,24256
4,content,object,0,23585
5,score,int64,0,5
6,thumbsUpCount,int64,0,159
7,reviewCreatedVersion,object,5173,3
8,at,datetime64[ns],0,24141
9,replyContent,object,34857,4


In [4]:
# Remove repeated reviews: rows that share the same rating, text and user name
# are collapsed to a single row (the frame is modified in place).
duplicate_keys = ['score', 'content', 'userName']
df.drop_duplicates(subset=duplicate_keys, inplace=True)

In [5]:
# Size features for each review: character count and whitespace-separated word count.
df['reviewLength'] = df['content'].map(len)
df['wordCount'] = df['content'].map(lambda text: len(text.split()))

def scoring(score):
    """Translate a numeric rating into a sentiment label.

    Parameters
    ----------
    score : number
        Rating to classify; it is compared against 3.

    Returns
    -------
    str
        'Positive' when ``score`` is greater than 3, 'Negative' when it is
        less than 3, and 'Neutral' for everything else.
    """
    if score > 3:
        return 'Positive'
    if score < 3:
        return 'Negative'
    # Neither above nor below the midpoint.
    return "Neutral"
# Label every review with the sentiment derived from its rating.
df['sentiment_score'] = df['score'].map(scoring)

In [6]:
## Language cleaning

#get languages by document
# Load the `en_core_web_sm` pipeline and append the language detector as its
# last component; the functions below read its result from `_.language`.
# NOTE(review): passing a component instance plus `name=` to add_pipe looks like
# the spaCy 2.x call style; spaCy 3.x expects a registered factory name instead.
# Confirm the spaCy version this notebook is pinned to.
nlp = spacy.load("en_core_web_sm")
language_detector = LanguageDetector()
nlp.add_pipe(language_detector, name="language_detector", last=True)
    
def get_lang_doc(text):
    """Return the language reported for ``text`` taken as a whole document.

    The text is run through the module-level ``nlp`` pipeline and the
    ``'language'`` entry of the resulting doc's ``_.language`` extension is
    returned. How that value is chosen is decided by the language-detector
    component, not by this function.

    Parameters
    ----------
    text
        Text to analyse; passed straight to ``nlp``.

    Returns
    -------
    The value stored under ``'language'`` for the whole document.
    """
    detection = nlp(text)._.language
    return detection['language']

def get_langs_sent(text):
    """Return the distinct languages reported across the sentences of ``text``.

    The text is run through the module-level ``nlp`` pipeline; for every
    sentence in ``doc.sents`` the ``'language'`` entry of its ``_.language``
    extension is collected, and duplicates are removed.

    Parameters
    ----------
    text
        Text to analyse; passed straight to ``nlp``.

    Returns
    -------
    list
        The unique language values, sorted so that repeated runs produce the
        same ordering. The list is empty when the doc yields no sentences.
    """
    doc = nlp(text)
    # A set removes duplicates directly; no per-sentence index is needed.
    detected = {sent._.language['language'] for sent in doc.sents}
    # Set iteration order is arbitrary, so sort for a reproducible result.
    return sorted(detected)

In [7]:
# Document-level language and the list of sentence-level languages per review.
# Each detector calls the pipeline itself, so every review is parsed once per column.
for column, detector in (('language', get_lang_doc), ('language_', get_langs_sent)):
    df[column] = df['content'].apply(detector)

In [8]:
def is_eng(x):
    """Flag whether ``x`` holds exactly one item and contains ``'en'``.

    Parameters
    ----------
    x
        Sized container of language codes (e.g. a list).

    Returns
    -------
    int
        1 when ``x`` has length 1 and ``'en'`` is in it, otherwise 0.
    """
    only_english = len(x) == 1 and 'en' in x
    return 1 if only_english else 0
# 1 for reviews whose sentences were all detected as English, 0 otherwise.
df['en_'] = df['language_'].map(is_eng)

In [9]:
# Keep only English reviews with more than 5 words, and only the columns
# carried forward from this notebook.
keep_columns = ['userName', 'at', 'content', 'sentiment_score', 'reviewLength', 'wordCount']
is_english = df['en_'] == 1
is_long_enough = df['wordCount'] > 5

df = df.loc[is_english & is_long_enough, keep_columns]

In [10]:
#export cleaned reviews

# df.to_csv('clean_reviews.csv',index=False)

#### Next --> Data Processing