### Apple app store dataset extraction, transformation

In [3]:
import pandas as pd

#import csv file into pandas
csv_data = pd.read_csv("AppleStore.csv", encoding="utf-8")
apple_df = pd.DataFrame(csv_data)
apple_df.head(10)

final_apple_df = apple_df[["track_name", "size_bytes", "user_rating", "rating_count_tot", "price", "cont_rating"]].copy()

#renamed with appended "_g" for google store columns and "_a" for apple store
final_apple_df.rename(columns = {'track_name':'App Name', 'size_bytes': 'Size_a', 'user_rating': 'user_rating_a', 'rating_count_tot': 'rating_count_tot_a', 'price' : 'price_a', 'cont_rating': 'cont_rating_a'}, inplace = True)
final_apple_df.head()

final_apple_df.drop_duplicates(inplace = True)
final_apple_df.dropna(inplace = True)
final_apple_df.head()

Unnamed: 0,App Name,Size_a,user_rating_a,rating_count_tot_a,price_a,cont_rating_a
0,PAC-MAN Premium,100788224,4.0,21292,3.99,4+
1,Evernote - stay organized,158578688,4.0,161065,0.0,4+
2,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,3.5,188583,0.0,4+
3,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,4.0,262241,0.0,12+
4,Bible,92774400,4.5,985920,0.0,4+


### Google app store dataset extraction, transformation

In [2]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine

### Store Google Dataset CSV into DataFrame

csv_file = "Google-Playstore-Full.csv"
google_df = pd.read_csv(csv_file,low_memory=False)
google_df.head()

### Create new data with select columns

new_google_df = google_df[['App Name', 'Category', 'Rating','Reviews','Installs','Size','Price','Content Rating']].copy()
new_google_df.head()

### Rename the columns based on the dataset source

google_renamed=new_google_df.rename(columns={"Category": "Category_g", "Rating": "Rating_g",
                              "Reviews":"Reviews_g","Installs":"Installs_g",
                              "Size":"Size_g","Price":"Price_g","Content Rating":"Content Rating_g"})
google_renamed.head()

google_renamed.describe()

### Data Cleaning

google_renamed.dropna()

google_renamed.drop_duplicates(inplace=True)

google_renamed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267035 entries, 0 to 267051
Data columns (total 8 columns):
App Name            267034 non-null object
Category_g          267034 non-null object
Rating_g            267035 non-null object
Reviews_g           267034 non-null object
Installs_g          267035 non-null object
Size_g              267035 non-null object
Price_g             267035 non-null object
Content Rating_g    267035 non-null object
dtypes: object(8)
memory usage: 18.3+ MB


In [4]:
app_df=google_renamed.merge(final_apple_df, how="inner", on="App Name")

In [5]:
app_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1435 entries, 0 to 1434
Data columns (total 13 columns):
App Name              1435 non-null object
Category_g            1435 non-null object
Rating_g              1435 non-null object
Reviews_g             1435 non-null object
Installs_g            1435 non-null object
Size_g                1435 non-null object
Price_g               1435 non-null object
Content Rating_g      1435 non-null object
Size_a                1435 non-null int64
user_rating_a         1435 non-null float64
rating_count_tot_a    1435 non-null int64
price_a               1435 non-null float64
cont_rating_a         1435 non-null object
dtypes: float64(2), int64(2), object(9)
memory usage: 157.0+ KB


In [7]:
app_df.head()

Unnamed: 0,App Name,Category_g,Rating_g,Reviews_g,Installs_g,Size_g,Price_g,Content Rating_g,Size_a,user_rating_a,rating_count_tot_a,price_a,cont_rating_a
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034,"5,000,000+",Varies with device,0,Everyone,100554752,4.5,25947,0.0,4+
1,Allrecipes Dinner Spinner,FOOD_AND_DRINK,4.545353413,67514,"5,000,000+",Varies with device,0,Everyone,36399104,3.5,109349,0.0,12+
2,Domino's Pizza USA,FOOD_AND_DRINK,4.739675045,1177040,"10,000,000+",Varies with device,0,Everyone,105743360,5.0,258624,0.0,4+
3,Chick-fil-A,FOOD_AND_DRINK,4.374690533,52526,"5,000,000+",19M,0,Everyone,96230400,3.5,5665,0.0,4+
4,Amazon Prime Now,SHOPPING,3.924423456,24849,"10,000,000+",38M,0,Teen,48007168,3.0,895,0.0,4+
