In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
#warnings.simplefilter(action='ignore')

In [3]:
# I specify 'User DF as it's a different DF than the other notebook
user_df = pd.read_csv('googleplaystore_user_reviews.csv')

In [4]:
user_df = user_df.dropna(how='any')

In [5]:
# Create a joined corpus of text for each app
df_text = user_df.groupby('App')['Translated_Review'].apply(lambda x: ','.join(x)).reset_index()

Creating binaries for if each sentiment was positive, neutral or negative, then summing them

In [6]:
user_df['Positive'] = user_df['Sentiment'].map(dict(Positive=1, Neutral=0, Negative=0))
user_df['Neutral'] = user_df['Sentiment'].map(dict(Positive=0, Neutral=1, Negative=0))
user_df['Negative'] = user_df['Sentiment'].map(dict(Positive=0, Neutral=0, Negative=1))

In [7]:
df_sent = user_df.groupby('App').sum()

Turning the sum of positive/neutral/negative into ratios

In [8]:
df_sent['ratio_pos'] = df_sent['Positive'] / (df_sent['Neutral'] + df_sent['Negative'])

In [9]:
df_sent['ratio_neg'] = df_sent['Negative'] / (df_sent['Neutral'] + df_sent['Positive'])

In [10]:
df_sent['ratio_neut'] = df_sent['Neutral'] / (df_sent['Negative'] + df_sent['Positive'])

Counting total reviews

In [11]:
df_sent['total_reviews'] = df_sent['Positive'] + df_sent['Neutral'] + df_sent['Negative']

In [12]:
df_sent = df_sent.drop(columns=['Sentiment_Polarity','Sentiment_Subjectivity', 'Positive',
                                'Neutral', 'Negative'])

In [13]:
mean_df_sent = user_df.groupby('App').mean()
df_sent['Sentiment_Polarity'] = mean_df_sent['Sentiment_Polarity']
df_sent['Sentiment_Subjectivity'] = mean_df_sent['Sentiment_Subjectivity']

In [14]:
# setting infinities to max finite number in dataset
df_sent['ratio_pos'] = np.where(df_sent['ratio_pos'] == float('inf'), np.ma.masked_invalid(df_sent['ratio_pos']).max(), df_sent['ratio_pos'])
df_sent['ratio_pos'] = np.where(df_sent['ratio_neg'] == float('inf'), np.ma.masked_invalid(df_sent['ratio_neg']).max(), df_sent['ratio_neg'])
df_sent['ratio_neut'] = np.where(df_sent['ratio_neut'] == float('inf'), np.ma.masked_invalid(df_sent['ratio_neut']).max(), df_sent['ratio_neut'])

In [15]:
df_sent = df_sent.reset_index()

Feature Engineering

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
vect = CountVectorizer(lowercase=True, strip_accents='ascii', stop_words='english', 
                       ngram_range=(1, 2), max_features=10000)

In [18]:
vect_text = vect.fit_transform(df_text['Translated_Review'])

In [19]:
vect_df = pd.DataFrame(vect_text.toarray())

In [20]:
vect_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
## Dimensionality Reduction ##

In [22]:
from sklearn.decomposition import TruncatedSVD

In [23]:
lsa = TruncatedSVD(n_components=100, n_iter=10, random_state=42)

In [24]:
lsa.fit(vect_df)

TruncatedSVD(n_components=100, n_iter=10, random_state=42)

In [25]:
svd_result = lsa.transform(vect_df)

In [26]:
svd_df = pd.DataFrame(svd_result)

In [27]:
svd_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,31.018723,47.979740,-3.306091,-22.099386,-1.305492,-12.191886,17.986619,2.546629,-10.213569,12.750877,...,-3.408999,-2.719671,7.582463,8.777137,-13.285420,1.122956,4.503685,5.546937,-4.809370,-1.263019
1,3.721123,8.743141,-0.753101,-4.143599,1.055024,0.266230,1.995338,-1.104359,-1.280609,0.645270,...,-1.270735,0.509643,0.201318,-1.612208,-0.387447,-0.545392,0.086825,0.967130,0.374730,0.940290
2,5.039151,8.758359,-1.276903,-0.657862,1.179077,0.565911,-2.180605,-0.490197,-0.737505,-1.194460,...,-0.279012,-0.292717,-0.670330,0.710428,0.247492,-0.064989,-0.054957,0.043286,-1.039450,0.823454
3,12.938707,24.381076,-1.415965,-5.783934,4.534844,-1.479366,7.137282,0.997214,4.636467,5.397325,...,1.594895,2.232551,-2.741194,-0.994313,-1.594104,0.081270,1.561410,-0.803989,-3.615361,-0.010048
4,14.055135,-3.068120,0.362680,-1.930117,-0.699440,0.553655,1.740922,1.272880,-0.061528,1.176033,...,-0.190866,0.581596,0.057250,-0.739997,0.349915,-0.543158,-0.441419,0.060130,-0.073423,0.629442
5,4.235730,6.389789,-1.120062,-1.757785,-0.276712,-0.755892,-0.120129,0.465693,-1.123231,0.596423,...,0.761765,-0.403304,-0.062724,0.502618,0.192950,-0.136904,0.463516,0.593300,-0.372199,0.141707
6,31.466058,44.955474,-7.781923,1.859160,3.049330,-8.384158,4.791935,5.086863,3.828207,2.666955,...,6.145452,3.746559,-3.635512,2.622979,0.389899,-0.306109,-4.962772,-2.065219,0.928067,-3.248208
7,8.108048,14.245328,-2.756664,-2.191892,0.651029,-0.463454,-0.944606,-1.458929,-1.373596,5.576529,...,-2.651393,1.342256,-1.734941,-2.362826,-0.393687,-1.607473,-2.602722,0.386377,-1.722462,3.627950
8,6.136860,9.807724,-1.524742,-1.147836,0.136493,-0.827375,-0.900379,0.365950,-1.019994,-0.824969,...,1.406943,0.909135,-1.086640,-0.454508,-0.655801,-0.836326,0.720279,-0.668287,0.076148,-0.287679
9,5.240006,7.701877,-0.977826,-2.665880,0.361889,0.091531,0.128108,-1.140371,-2.206198,3.400572,...,0.243600,0.363068,-1.227416,-1.529560,-0.070744,-0.836906,-0.040920,0.768973,-1.004852,1.139408


In [28]:
df_sent_lsa = pd.concat([df_sent, svd_df], axis=1)

In [29]:
df_sent_lsa

Unnamed: 0,App,ratio_pos,ratio_neg,ratio_neut,total_reviews,Sentiment_Polarity,Sentiment_Subjectivity,0,1,2,...,90,91,92,93,94,95,96,97,98,99
0,10 Best Foods for You,0.054348,0.054348,0.127907,194,0.470733,0.495455,31.018723,47.979740,-3.306091,...,-3.408999,-2.719671,7.582463,8.777137,-13.285420,1.122956,4.503685,5.546937,-4.809370,-1.263019
1,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,0.025641,0.025641,0.250000,40,0.392405,0.545516,3.721123,8.743141,-0.753101,...,-1.270735,0.509643,0.201318,-1.612208,-0.387447,-0.545392,0.086825,0.967130,0.374730,0.940290
2,11st,0.218750,0.218750,0.300000,39,0.185943,0.455340,5.039151,8.758359,-1.276903,...,-0.279012,-0.292717,-0.670330,0.710428,0.247492,-0.064989,-0.054957,0.043286,-1.039450,0.823454
3,1800 Contacts - Lens Store,0.081081,0.081081,0.142857,80,0.318145,0.591098,12.938707,24.381076,-1.415965,...,1.594895,2.232551,-2.741194,-0.994313,-1.594104,0.081270,1.561410,-0.803989,-3.615361,-0.010048
4,1LINE – One Line with One Touch,0.266667,0.266667,0.085714,38,0.196290,0.557315,14.055135,-3.068120,0.362680,...,-0.190866,0.581596,0.057250,-0.739997,0.349915,-0.543158,-0.441419,0.060130,-0.073423,0.629442
5,2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif,0.032258,0.032258,0.230769,32,0.449566,0.520573,4.235730,6.389789,-1.120062,...,0.761765,-0.403304,-0.062724,0.502618,0.192950,-0.136904,0.463516,0.593300,-0.372199,0.141707
6,21-Day Meditation Experience,0.142857,0.142857,0.025641,80,0.258014,0.551048,31.466058,44.955474,-7.781923,...,6.145452,3.746559,-3.635512,2.622979,0.389899,-0.306109,-4.962772,-2.065219,0.928067,-3.248208
7,"2Date Dating App, Love and matching",0.225806,0.225806,0.151515,38,0.280267,0.558391,8.108048,14.245328,-2.756664,...,-2.651393,1.342256,-1.734941,-2.362826,-0.393687,-1.607473,-2.602722,0.386377,-1.722462,3.627950
8,2GIS: directory & navigator,0.176471,0.176471,0.379310,40,0.223129,0.396658,6.136860,9.807724,-1.524742,...,1.406943,0.909135,-1.086640,-0.454508,-0.655801,-0.836326,0.720279,-0.668287,0.076148,-0.287679
9,2RedBeans,0.054054,0.054054,0.181818,39,0.412199,0.597868,5.240006,7.701877,-0.977826,...,0.243600,0.363068,-1.227416,-1.529560,-0.070744,-0.836906,-0.040920,0.768973,-1.004852,1.139408


In [30]:
df_sent_lsa.to_csv('Processed Google Play Reviews.csv', index=False)