# Data pre-processing to make it ready for the model to learn



#### Have you identified other variables transformations that you need to do based on the distributions you have analyzed so far?

In [1]:
#Feature engineering is the process of creating features (also called "attributes") 
#that don't already exist in the dataset. This means that if your dataset already contains enough 
#"useful" features, you don't necessarily need to engineer additional features.

Answer here

In [63]:
import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn import linear_model
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=Warning)
np.seterr(divide='ignore', invalid='ignore')

# Mount local files
#from google.colab import drive
#drive.mount('/gdrive')

# Hide deprecated warnings
import warnings
warnings.filterwarnings('ignore')

In [64]:
#  Load the clean dataset form previous notebook

In [65]:
# Load data from the folder where I uploaded the csv file data with grouped Features
data = pd.read_csv('/Users/AirMorena/Desktop/final_proj/csv/google_play_store/data_engineer1.csv')

# define functions

In [66]:
def one_hot_encode_categorical_variables(series, name):
      return pd.get_dummies(series, prefix=name)

In [67]:
# Check the dimensions
data.shape

(9351, 27)

In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9351 entries, 0 to 9350
Data columns (total 27 columns):
App                         9351 non-null object
Category                    9351 non-null object
Rating                      9351 non-null float64
Reviews                     9351 non-null int64
Size                        9351 non-null float64
Installs                    9351 non-null int64
Type                        9351 non-null object
Price                       9351 non-null float64
Content Rating              9351 non-null object
Genres                      9351 non-null object
Last Updated                9351 non-null object
Current Ver                 9351 non-null object
Android Ver                 9351 non-null float64
day                         9351 non-null int64
month                       9351 non-null int64
year                        9351 non-null int64
month_year                  9351 non-null object
Current Ver_clean           9351 non-null float64
App_dup    

In [69]:
# if there are missing values: is there at least one missing  value? True or false
data.isnull().sum()

App                         0
Category                    0
Rating                      0
Reviews                     0
Size                        0
Installs                    0
Type                        0
Price                       0
Content Rating              0
Genres                      0
Last Updated                0
Current Ver                 0
Android Ver                 0
day                         0
month                       0
year                        0
month_year                  0
Current Ver_clean           0
App_dup                     0
Successful_App              0
Installs_group              0
reviews_levels              0
Size_group                  0
Current Ver_clean_levels    0
Android Ver_levels          0
st_Rating                   0
st_Price                    0
dtype: int64

## Dummify categorical variables

* We want our categorical variables to be part of our logistic regression model. We need to transform into boolean features?

In [70]:
data.columns.tolist()

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver',
 'day',
 'month',
 'year',
 'month_year',
 'Current Ver_clean',
 'App_dup',
 'Successful_App',
 'Installs_group',
 'reviews_levels',
 'Size_group',
 'Current Ver_clean_levels',
 'Android Ver_levels',
 'st_Rating',
 'st_Price']

In [71]:
#data.head()

In [72]:
# drop columns we will not use for the model: (the continous , since we create categorical from them)
data1=data.drop(['App','Successful_App','Size','Installs','Reviews','Android Ver','Price','Current Ver','Current Ver_clean','Rating',
                 'Price','Last Updated','Genres','day','month_year','st_Price','App_dup'], axis=1)

In [73]:
data1.columns.tolist()

['Category',
 'Type',
 'Content Rating',
 'month',
 'year',
 'Installs_group',
 'reviews_levels',
 'Size_group',
 'Current Ver_clean_levels',
 'Android Ver_levels',
 'st_Rating']

In [74]:
data1.head()

Unnamed: 0,Category,Type,Content Rating,month,year,Installs_group,reviews_levels,Size_group,Current Ver_clean_levels,Android Ver_levels,st_Rating
0,ART_AND_DESIGN,Free,Everyone,1,2018,low,low,small,low,high,-0.17964
1,ART_AND_DESIGN,Free,Everyone,1,2018,high,low,small,medium,high,-0.569116
2,ART_AND_DESIGN,Free,Everyone,8,2018,high,high,small,low,high,0.988789
3,ART_AND_DESIGN,Free,Teen,6,2018,high,high,big,high,high,0.599313
4,ART_AND_DESIGN,Free,Everyone,6,2018,low,low,small,low,high,0.209837


In [75]:
# feature engineering

In [76]:
# Transform categorical variables into boolean new columns with pandas get_dummies function
#total columns minus target variable 'Successful_App', and minus 'App' 
total_columns={
 'Category',
 'Type',
 'Content Rating',
 'month',
 'year',
 'reviews_levels',
 'Size_group',
#'Installs_group',# remove this one so that it can not interfer with the target
    'Current Ver_clean_levels','Android Ver_levels'
}

new_boolean_columns = []
for feature in total_columns:
    new_boolean_columns.append(one_hot_encode_categorical_variables(data1[feature], feature))

In [77]:
#keep numericals as st_Price,st_Rating

In [78]:
#new_boolean_columns
#pd.concat(new_boolean_columns, axis=1) 

In [79]:
# assign 0 and 1 to low and high in installs grouping
mapping={'high': 1, 'low':0}
# store this modified data (we just changed Gender column) into data_clean
data1['Installs_group']=data1['Installs_group'].map(mapping)

In [80]:
df_new_boolean_columns = pd.concat(new_boolean_columns, axis=1)  
#Drop old categorical features
data2 = data1.drop(total_columns, axis=1)

#Append dummy categorical features (now boolean)
final_feature_vector = pd.concat([data2[['Installs_group','st_Rating']], 
                                  df_new_boolean_columns], axis=1)

In [81]:
final_feature_vector.head() 

Unnamed: 0,Installs_group,st_Rating,Category_ART_AND_DESIGN,Category_AUTO_AND_VEHICLES,Category_BEAUTY,Category_BOOKS_AND_REFERENCE,Category_BUSINESS,Category_COMICS,Category_COMMUNICATION,Category_DATING,...,month_11,month_12,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated,Android Ver_levels_high,Android Ver_levels_low
0,0,-0.17964,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,-0.569116,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,1,0.988789,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,0.599313,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,0,0.209837,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [82]:
final_feature_vector.columns

Index(['Installs_group', 'st_Rating', 'Category_ART_AND_DESIGN',
       'Category_AUTO_AND_VEHICLES', 'Category_BEAUTY',
       'Category_BOOKS_AND_REFERENCE', 'Category_BUSINESS', 'Category_COMICS',
       'Category_COMMUNICATION', 'Category_DATING', 'Category_EDUCATION',
       'Category_ENTERTAINMENT', 'Category_EVENTS', 'Category_FAMILY',
       'Category_FINANCE', 'Category_FOOD_AND_DRINK', 'Category_GAME',
       'Category_HEALTH_AND_FITNESS', 'Category_HOUSE_AND_HOME',
       'Category_LIBRARIES_AND_DEMO', 'Category_LIFESTYLE',
       'Category_MAPS_AND_NAVIGATION', 'Category_MEDICAL',
       'Category_NEWS_AND_MAGAZINES', 'Category_PARENTING',
       'Category_PERSONALIZATION', 'Category_PHOTOGRAPHY',
       'Category_PRODUCTIVITY', 'Category_SHOPPING', 'Category_SOCIAL',
       'Category_SPORTS', 'Category_TOOLS', 'Category_TRAVEL_AND_LOCAL',
       'Category_VIDEO_PLAYERS', 'Category_WEATHER', 'reviews_levels_high',
       'reviews_levels_low', 'reviews_levels_medium',
      

### Get the final shape of your feature vector dataframe

In [83]:
final_feature_vector.shape

(9351, 74)

## Store the data into a new csv file

In [84]:
final_feature_vector.to_csv("/Users/AirMorena/Desktop/final_proj/csv/google_play_store/160619final_feature_vector.csv", index=False, encoding='utf-8')

In [85]:
final_feature_vector.head()

Unnamed: 0,Installs_group,st_Rating,Category_ART_AND_DESIGN,Category_AUTO_AND_VEHICLES,Category_BEAUTY,Category_BOOKS_AND_REFERENCE,Category_BUSINESS,Category_COMICS,Category_COMMUNICATION,Category_DATING,...,month_11,month_12,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated,Android Ver_levels_high,Android Ver_levels_low
0,0,-0.17964,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,-0.569116,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,1,0.988789,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,0.599313,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,0,0.209837,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
