### Module import

In [None]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine

## Step 1: Data Import: Apple dataset and Google dataset

### Apple app store dataset extraction and import

In [2]:
#import csv file into pandas
csv_data = pd.read_csv("AppleStore.csv", encoding="utf-8")
apple_df = pd.DataFrame(csv_data)
apple_df.head(10)



### Google app store dataset extraction, transformation

##### Store Google Dataset CSV into DataFrame

In [4]:
csv_file = "Google-Playstore-Full.csv"
google_df = pd.read_csv(csv_file,low_memory=False)
google_df.head()

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device,,,,
1,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.400671482,1207922,"100,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device,,,,
2,Peapod,SHOPPING,3.656329393,1967,"100,000+",1.4M,0,Everyone,"September 20, 2018",5.0 and up,2.2.0,,,,
3,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.107232571,389154,"10,000,000+",16M,0,Everyone,"March 22, 2019",4.2 and up,4.18.2,,,,
4,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.647752285,2291,"10,000+",Varies with device,$5.99,Everyone,"April 1, 2019",Varies with device,Varies with device,,,,


## Step2: Data Transformation: rename columns, drop missing values, drop duplicates, inner join

### Create new data with select columns

In [None]:
final_apple_df = apple_df[["track_name", "size_bytes", "user_rating", "rating_count_tot", "price", "cont_rating"]].copy()

In [5]:
new_google_df = google_df[['App Name', 'Category', 'Rating','Reviews','Installs','Size','Price','Content Rating']].copy()
new_google_df.head()

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone
1,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.400671482,1207922,"100,000,000+",Varies with device,0,Everyone
2,Peapod,SHOPPING,3.656329393,1967,"100,000+",1.4M,0,Everyone
3,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.107232571,389154,"10,000,000+",16M,0,Everyone
4,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.647752285,2291,"10,000+",Varies with device,$5.99,Everyone


### Rename the columns of Apple app list

In [None]:
#renamed with appended "_g" for google store columns and "_a" for apple store
final_apple_df.rename(columns = {'track_name':'App_Name', 'size_bytes': 'Size_a', 'user_rating': 'user_rating_a', 'rating_count_tot': 'rating_count_tot_a', 'price' : 'price_a', 'cont_rating': 'cont_rating_a'}, inplace = True)
final_apple_df.head()

### Drop duplicates and missing values of Apple app list

In [3]:
final_apple_df.drop_duplicates(inplace = True)
final_apple_df.dropna(inplace = True)
final_apple_df.head()

Unnamed: 0,App_Name,Size_a,user_rating_a,rating_count_tot_a,price_a,cont_rating_a
0,PAC-MAN Premium,100788224,4.0,21292,3.99,4+
1,Evernote - stay organized,158578688,4.0,161065,0.0,4+
2,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,3.5,188583,0.0,4+
3,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,4.0,262241,0.0,12+
4,Bible,92774400,4.5,985920,0.0,4+


### Rename the columns of Google app list

In [22]:
google_renamed=new_google_df.rename(columns={"App Name":"App_Name","Category": "Category_g", "Rating": "Rating_g",
                              "Reviews":"Reviews_g","Installs":"Installs_g",
                              "Size":"Size_g","Price":"Price_g","Content Rating":"Content_Rating_g"})
google_renamed.head()
google_renamed.describe()

Unnamed: 0,App_Name,Category_g,Rating_g,Reviews_g,Installs_g,Size_g,Price_g,Content_Rating_g
count,267051,267051,267052,267051,267052,267052,267052,267052
unique,244406,67,99856,24544,38,1248,504,12
top,????,EDUCATION,5,1,"10,000+",Varies with device,0,Everyone
freq,766,33394,23804,9203,60531,11726,255428,241578


### Drop duplicates and missing values of Google app list

In [7]:
google_renamed.dropna()
google_renamed.drop_duplicates(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267035 entries, 0 to 267051
Data columns (total 8 columns):
App_Name            267034 non-null object
Category_g          267034 non-null object
Rating_g            267035 non-null object
Reviews_g           267034 non-null object
Installs_g          267035 non-null object
Size_g              267035 non-null object
Price_g             267035 non-null object
Content_Rating_g    267035 non-null object
dtypes: object(8)
memory usage: 18.3+ MB


In [21]:
google_renamed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267035 entries, 0 to 267051
Data columns (total 8 columns):
App_Name            267034 non-null object
Category_g          267034 non-null object
Rating_g            267035 non-null object
Reviews_g           267034 non-null object
Installs_g          267035 non-null object
Size_g              267035 non-null object
Price_g             267035 non-null object
Content_Rating_g    267035 non-null object
dtypes: object(8)
memory usage: 18.3+ MB


### Merge two tables based the common key "App_Name"

In [8]:
app_df=google_renamed.merge(final_apple_df, how="inner", on="App_Name")

In [9]:
app_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1435 entries, 0 to 1434
Data columns (total 13 columns):
App_Name              1435 non-null object
Category_g            1435 non-null object
Rating_g              1435 non-null object
Reviews_g             1435 non-null object
Installs_g            1435 non-null object
Size_g                1435 non-null object
Price_g               1435 non-null object
Content_Rating_g      1435 non-null object
Size_a                1435 non-null int64
user_rating_a         1435 non-null float64
rating_count_tot_a    1435 non-null int64
price_a               1435 non-null float64
cont_rating_a         1435 non-null object
dtypes: float64(2), int64(2), object(9)
memory usage: 157.0+ KB


### Preview first five rows after merging two tables

In [10]:
app_df.head()

Unnamed: 0,App_Name,Category_g,Rating_g,Reviews_g,Installs_g,Size_g,Price_g,Content_Rating_g,Size_a,user_rating_a,rating_count_tot_a,price_a,cont_rating_a
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone,100554752,4.5,25947,0.0,4+
1,Allrecipes Dinner Spinner,FOOD_AND_DRINK,4.545353413,67514,"5,000,000+",Varies with device,0,Everyone,36399104,3.5,109349,0.0,12+
2,Domino's Pizza USA,FOOD_AND_DRINK,4.739675045,1177040,"10,000,000+",Varies with device,0,Everyone,105743360,5.0,258624,0.0,4+
3,Chick-fil-A,FOOD_AND_DRINK,4.374690533,52526,"5,000,000+",19M,0,Everyone,96230400,3.5,5665,0.0,4+
4,Amazon Prime Now,SHOPPING,3.924423456,24849,"10,000,000+",38M,0,Teen,48007168,3.0,895,0.0,4+


## Step 3: Data Load: import dataset to postgreSQL

### Connect to local database

In [15]:
rds_connection_string = "postgres:@localhost:5432/App_list"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [16]:
engine.table_names()

['App_list']

### Use pandas to load json converted DataFrame into database

In [17]:
app_df.to_sql(name='app_list', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the App_list table
* NOTE: can also check using pgAdmin

In [18]:
pd.read_sql_query('select * from app_list', con=engine).head()

Unnamed: 0,App_Name,Category_g,Rating_g,Reviews_g,Installs_g,Size_g,Price_g,Content_Rating_g,Size_a,user_rating_a,rating_count_tot_a,price_a,cont_rating_a
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone,100554752,4.5,25947,0.0,4+
1,Allrecipes Dinner Spinner,FOOD_AND_DRINK,4.545353413,67514,"5,000,000+",Varies with device,0,Everyone,36399104,3.5,109349,0.0,12+
2,Domino's Pizza USA,FOOD_AND_DRINK,4.739675045,1177040,"10,000,000+",Varies with device,0,Everyone,105743360,5.0,258624,0.0,4+
3,Chick-fil-A,FOOD_AND_DRINK,4.374690533,52526,"5,000,000+",19M,0,Everyone,96230400,3.5,5665,0.0,4+
4,Amazon Prime Now,SHOPPING,3.924423456,24849,"10,000,000+",38M,0,Teen,48007168,3.0,895,0.0,4+
