In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from config import db_password
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf


In [2]:
#create connection to database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/DbFinalProject"
engine = create_engine(db_string)

In [3]:
#import googleplaystore data
df = pd.read_sql_table('googleplaystore_data', con=engine)
df.head()


Unnamed: 0,app_id,app_name,category,app_type,content_rating,genres,sub_genre,rating,reviews,app_size,installs,price,last_updated
0,A1479,Learn SQL,EDUCATION,Free,Everyone,Education,0,4.7,19277,5.3,1000000,0.0,2017-12-25
1,A2160,CX-OF,FAMILY,Free,Everyone,Entertainment,0,4.2,18,37.0,1000,0.0,2018-01-31
2,A2907,FI CFL,FINANCE,Free,Everyone,Finance,0,3.7,112,3.9,10000,0.0,2018-07-05
3,A3213,Quiz DC,GAME,Free,Everyone,Trivia,0,1.4,33,3.1,1000,0.0,2017-12-25
4,A4315,I AM C.T.,HEALTH_AND_FITNESS,Free,Mature 17+,Health & Fitness,0,4.6,28,20.0,1000,0.0,2017-04-27


In [4]:
df_encode = df.drop(columns=["app_id","app_name","last_updated", "price"])
df_encode.head()

Unnamed: 0,category,app_type,content_rating,genres,sub_genre,rating,reviews,app_size,installs
0,EDUCATION,Free,Everyone,Education,0,4.7,19277,5.3,1000000
1,FAMILY,Free,Everyone,Entertainment,0,4.2,18,37.0,1000
2,FINANCE,Free,Everyone,Finance,0,3.7,112,3.9,10000
3,GAME,Free,Everyone,Trivia,0,1.4,33,3.1,1000
4,HEALTH_AND_FITNESS,Free,Mature 17+,Health & Fitness,0,4.6,28,20.0,1000


In [5]:
le = LabelEncoder()
df2 = df_encode.copy()
df2['app_type'] = le.fit_transform(df2['app_type'])
df2['category'] = le.fit_transform(df2['category'])
df2['content_rating'] = le.fit_transform(df2['content_rating'])
df2['genres'] = le.fit_transform(df2['genres'])


df2.head()

Unnamed: 0,category,app_type,content_rating,genres,sub_genre,rating,reviews,app_size,installs
0,8,0,1,15,0,4.7,19277,5.3,1000000
1,11,0,1,17,0,4.2,18,37.0,1000
2,12,0,1,19,0,3.7,112,3.9,10000
3,14,0,1,44,0,1.4,33,3.1,1000
4,15,0,3,21,0,4.6,28,20.0,1000


In [6]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728 entries, 0 to 7727
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category        7728 non-null   int32  
 1   app_type        7728 non-null   int32  
 2   content_rating  7728 non-null   int32  
 3   genres          7728 non-null   int32  
 4   sub_genre       7728 non-null   int64  
 5   rating          7728 non-null   float64
 6   reviews         7728 non-null   int64  
 7   app_size        7728 non-null   float64
 8   installs        7728 non-null   int64  
dtypes: float64(2), int32(4), int64(3)
memory usage: 422.8 KB


In [7]:
df2.describe()

Unnamed: 0,category,app_type,content_rating,genres,sub_genre,rating,reviews,app_size,installs
count,7728.0,7728.0,7728.0,7728.0,7728.0,7728.0,7728.0,7728.0,7728.0
mean,16.552666,0.074922,1.473602,23.914984,0.05396,4.174237,294710.7,22.956748,8418823.0
std,8.130285,0.263283,1.010276,13.345055,0.225953,0.543546,1863345.0,23.447283,50141610.0
min,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.008301,1.0
25%,11.0,0.0,1.0,14.0,0.0,4.0,108.0,5.3,10000.0
50%,14.0,0.0,1.0,24.0,0.0,4.3,2330.0,14.0,100000.0
75%,24.0,0.0,1.0,36.25,0.0,4.5,38980.25,33.0,1000000.0
max,32.0,1.0,5.0,47.0,1.0,5.0,44893890.0,100.0,1000000000.0


In [8]:
# Split our preprocessed data into our features and target arrays
y = df2["rating"]
X = df2.drop(columns="rating")

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1,
                                                    stratify=y)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train.shape
y_train.shape

(5796,)

In [29]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 8)                 72        
                                                                 
 dense_16 (Dense)            (None, 5)                 45        
                                                                 
 dense_17 (Dense)            (None, 1)                 6         
                                                                 
Total params: 123
Trainable params: 123
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [33]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200


Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200


Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [32]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

61/61 - 0s - loss: -7.3677e+11 - accuracy: 0.0021 - 174ms/epoch - 3ms/step
Loss: -736772096000.0, Accuracy: 0.0020703934133052826
