# Practical Hands-On Assessment
#### Gilang Amarullah

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Dataset
# Both tables are read from the same CSV export, each with its own header row.
# The path uses a forward slash so it resolves on Windows, macOS and Linux alike
# (a backslash is only a path separator on Windows).
analytics_csv = 'data_input/analytics.csv'
df_a = pd.read_csv(analytics_csv, header=5, nrows=11)
df_b = pd.read_csv(analytics_csv, header=17, index_col=0)

In [3]:
# Preview the first rows of the table read with header=5
df_a.head()

Unnamed: 0,Language,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Goal Conversion Rate,Goal Completions,Goal Value
0,en-us,23497,22696,35113,0.54%,6.35,00:02:32,27.21%,9555,$0.00
1,id-id,7797,7613,10617,0.65%,4.89,00:01:38,16.26%,1726,$0.00
2,en-gb,3198,3086,4625,0.56%,5.52,00:01:50,21.36%,988,$0.00
3,id,2171,2094,2724,0.29%,4.75,00:01:43,17.18%,468,$0.00
4,en,224,214,295,3.05%,5.03,00:01:44,26.78%,79,$0.00


In [4]:
# Preview the first rows of the table read with header=17
df_b.head()

Unnamed: 0_level_0,Users
Day Index,Unnamed: 1_level_1
1/1/19,122
1/2/19,174
1/3/19,240
1/4/19,231
1/5/19,172


In [5]:
# Converting Language

# Keep only the client (browser) language, i.e. the first two characters of
# each value ('en-us' -> 'en'), and label missing values as 'missing'.
# Values that already read 'missing' are left untouched, so re-running this
# cell does not truncate them to 'mi'.
raw_language = df_a['Language']
needs_trim = raw_language.notna() & raw_language.ne('missing')
df_a['Language'] = raw_language.astype(str).str[:2].where(needs_trim, 'missing')
df_a.head()

Unnamed: 0,Language,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Goal Conversion Rate,Goal Completions,Goal Value
0,en,23497,22696,35113,0.54%,6.35,00:02:32,27.21%,9555,$0.00
1,id,7797,7613,10617,0.65%,4.89,00:01:38,16.26%,1726,$0.00
2,en,3198,3086,4625,0.56%,5.52,00:01:50,21.36%,988,$0.00
3,id,2171,2094,2724,0.29%,4.75,00:01:43,17.18%,468,$0.00
4,en,224,214,295,3.05%,5.03,00:01:44,26.78%,79,$0.00


## Questions

#### Question 7
Download analytics.csv, which is exported as-is from the company's Google Analytics dashboard. Values in the Language column are formatted to capture both the client (browser) language and the keyboard language, but for this exercise we're only interested in the former. A value of en-id should hence be stored as en, and a value of id-jp should similarly be stored as id. Fill missing values with missing. This should result in en, id, th and missing as the valid values in the Language column. Which language has, on average, the highest Pages / Session count?

##### answer: en

In [6]:
# Mean Pages / Session for each language, ranked from highest to lowest
pages_by_language = df_a.groupby('Language')['Pages / Session'].mean()
pages_by_language.sort_values(ascending=False)

Language
en         5.961667
missing    5.860000
id         4.426667
th         3.500000
Name: Pages / Session, dtype: float64

#### Question 8
Using any tool of your choice, run a closed-form simple linear regression to predict Goal Conversion Rate (target) from the values of Pages / Session (predictor). Call this model_A. What is the multiple R-squared of your simple linear regression, model_A, rounded to 3 decimal places? You can retrieve this value through `sklearn.metrics.r2_score` or `summary(model)$r.squared`.

#### answer: 0.865

In [7]:
# Converting Goal Conversion Rate to Float

# GCR a.k.a Goal Conversion Rate: strip the '%' sign from each value
# (e.g. '27.21%') and parse the remainder as a float
gcr = [float(str(rate).replace('%', '')) for rate in df_a['Goal Conversion Rate']]

In [8]:
# Regression inputs as single-column 2-D arrays:
# x1 is the predictor (Pages / Session), y is the target (Goal Conversion Rate)
x1 = np.array(df_a['Pages / Session'])[:, np.newaxis]
y = np.array(gcr)[:, np.newaxis]

In [9]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): df_a is read with nrows=11, so only a handful of rows end up in the
# test set -- confirm whether Question 8 expects the regression to be fit on all rows.
x1_train, x1_test, y_train, y_test = train_test_split(
    x1,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Ordinary least squares fit of Goal Conversion Rate on Pages / Session (training rows only)
model_A = LinearRegression().fit(x1_train, y_train)
model_A

LinearRegression()

In [11]:
# Predict the target for the held-out test rows
y_pred_1 = model_A.predict(x1_test)

In [12]:
# R-squared on the held-out test rows, rounded to 3 decimals
model_A_r2 = round(metrics.r2_score(y_test, y_pred_1), 3)
print('r2:', model_A_r2)

# Adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1)
# NOTE(review): the R2 plugged in here is scored on all rows (x1, y), whereas the
# r2 printed above is scored on the test rows only -- confirm this mix is intended.
n_obs = len(y)
n_predictors = x1.shape[1]
r2_all_rows = model_A.score(x1, y)
model_A_r2_adj = 1 - (1 - r2_all_rows) * (n_obs - 1) / (n_obs - n_predictors - 1)
print('adjusted r2:', model_A_r2_adj)

r2: 0.865
adjusted r2: 0.8407252260649404


#### Question 9
Let beta0 be the intercept and beta1 be your slope. What is the value of beta0?

#### answer: -24.526
_closest to -25.188_

In [13]:
# The model was fit on a 2-D target (y has shape (n, 1)), so intercept_ is a
# one-element array rather than a scalar. Extract the element explicitly:
# calling float() directly on a 1-D array is deprecated since NumPy 1.25.
model_A_intercept = round(float(np.ravel(model_A.intercept_)[0]), 3)
print('intercept: ', model_A_intercept)

intercept:  -24.526


#### Question 10
Add Language as an additional predictor to the earlier linear regression model. Call this model_B. Did your model's multiple R-squared improve as a result? Compare the adjusted R-squared of the two models, model_A and model_B.

#### answer: model_A has a higher multiple R2 and adjusted R2 value

In [14]:
# splitting Language into separate indicator variables (one column per language)
# dtype=int pins the indicators to 0/1: pandas >= 2.0 otherwise returns booleans,
# which would turn np.array(df) into an object array once mixed with the float column.
# NOTE(review): every language level gets its own column, which is collinear with the
# model intercept (dummy-variable trap) -- consider drop_first=True if a full-rank
# design or a per-coefficient interpretation is needed.
language_dummy = pd.get_dummies(df_a['Language'], prefix='lang', dtype=int)

df = pd.concat([df_a['Pages / Session'], language_dummy], axis=1)
df.head()

Unnamed: 0,Pages / Session,lang_en,lang_id,lang_missing,lang_th
0,6.35,1,0,0,0
1,4.89,0,1,0,0
2,5.52,1,0,0,0
3,4.75,0,1,0,0
4,5.03,1,0,0,0


In [15]:
# Predictor matrix for model_B: Pages / Session plus the language indicator columns
x2 = df.to_numpy()

In [16]:
# Same test_size and random_state as the model_A split; y_train / y_test are reassigned here
x2_train, x2_test, y_train, y_test = train_test_split(
    x2,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Ordinary least squares fit using Pages / Session and the language indicators (training rows only)
model_B = LinearRegression().fit(x2_train, y_train)
model_B

LinearRegression()

In [18]:
# Predict the target for the held-out test rows
y_pred_2 = model_B.predict(x2_test)

In [19]:
# R-squared on the held-out test rows, rounded to 3 decimals
model_B_r2 = round(metrics.r2_score(y_test, y_pred_2), 3)
print('r2:', model_B_r2)

# Adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1)
# NOTE(review): p is taken as the number of columns in x2, which counts every
# language indicator column -- confirm this is the intended predictor count.
n_obs = len(y)
n_predictors = x2.shape[1]
r2_all_rows = model_B.score(x2, y)
model_B_r2_adj = 1 - (1 - r2_all_rows) * (n_obs - 1) / (n_obs - n_predictors - 1)
print('adjusted r2:', model_B_r2_adj)

r2: 0.732
adjusted r2: 0.6890157536947283


In [20]:
# Comparing r2 and adjusted r2 of the two models

# One (heading, model A score, model B score) entry per metric, printed in order
# with a separator line between the metrics
comparison = [
    ('r2', model_A_r2, model_B_r2),
    ('Adjusted r2', model_A_r2_adj, model_B_r2_adj),
]

for position, (heading, score_a, score_b) in enumerate(comparison):
    if position > 0:
        print('=========================')

    print(heading)
    print('Model A:', score_a)
    print('Model B:', score_b)

r2
Model A: 0.865
Model B: 0.732
Adjusted r2
Model A: 0.8407252260649404
Model B: 0.6890157536947283
