In [84]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('analytics.csv', delimiter=';')

# Print the column names to inspect
print(df.columns)

# Extract the client language from the Language column
df['Language'] = df['Language'].str.split('-').str[0]

# Fill missing values with 'missing'
df['Language'].fillna('missing', inplace=True)

# Check if 'Pages / Session' is already numeric before converting
if not pd.api.types.is_numeric_dtype(df['Pages / Session']):
    # Convert 'Pages / Session' column to numeric (remove commas and convert to float)
    df['Pages / Session'] = pd.to_numeric(df['Pages / Session'].str.replace(',', ''), errors='coerce')

# Group by Language and calculate the average Pages / Session
average_pages_per_session = df.groupby('Language')['Pages / Session'].mean()

# Find the language with the highest average Pages / Session
highest_language = average_pages_per_session.idxmax()

print(f"The language with the highest average Pages / Session is: {highest_language}")


Index(['Language', 'Users', 'New Users', 'Sessions', 'Bounce Rate',
       'Pages / Session', 'Avg. Session Duration', 'Goal Conversion Rate',
       'Goal Completions', 'Goal Value'],
      dtype='object')
The language with the highest average Pages / Session is: en


In [44]:
df

Unnamed: 0,Language,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Goal Conversion Rate,Goal Completions,Goal Value
0,en,23497,22696,35113,0.54%,6.35,00:02:32,27.21%,9555,$0.00
1,id,7797,7613,10617,0.65%,4.89,00:01:38,16.26%,1726,$0.00
2,en,3198,3086,4625,0.56%,5.52,00:01:50,21.36%,988,$0.00
3,id,2171,2094,2724,0.29%,4.75,00:01:43,17.18%,468,$0.00
4,en,224,214,295,3.05%,5.03,00:01:44,26.78%,79,$0.00
5,id,204,202,235,1.28%,3.64,00:00:44,4.68%,11,$0.00
6,th,186,186,224,0.45%,3.5,00:00:40,4.91%,11,$0.00
7,en,132,125,231,0.00%,6.92,00:02:10,39.83%,92,$0.00
8,en,126,119,171,0.00%,6.39,00:04:10,26.32%,45,$0.00
9,en,93,91,127,0.00%,5.56,00:01:22,19.69%,25,$0.00


In [63]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming df is your DataFrame with the data

# Convert 'Goal Conversion Rate' to numeric, removing the percentage sign
df['Goal Conversion Rate'] = pd.to_numeric(df['Goal Conversion Rate'].str.rstrip('%'), errors='coerce')

# Select predictor and target variables
X = df[['Pages / Session']]
y = df['Goal Conversion Rate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the linear regression model
model_A = LinearRegression()
model_A.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model_A.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)

# Print the R-squared value rounded to 3 decimal points
print(f"R-squared for model_A: {round(r_squared, 3)}")


R-squared for model_A: 0.865


In [64]:
# Retrieve the intercept (beta0) from the linear regression model
intercept_value = model_A.intercept_

# Print the intercept value
print(f"The intercept (beta0) is: {intercept_value}")


The intercept (beta0) is: -24.52647906146968


In [66]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.14.1-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Downloading patsy-0.5.5-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.1-cp311-cp311-win_amd64.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.9 MB 660.6 kB/s eta 0:00:15
    --------------------------------------- 0.2/9.9 MB 1.5 MB/s eta 0:00:07
   - -------------------------------------- 0.3/9.9 MB 2.1 MB/s eta 0:00:05
   - -------------------------------------- 0.4/9.9 MB 2.2 MB/s eta 0:00:05
   --- ------------------------------------ 0.9/9.9 MB 3.9 MB/s eta 0:00:03
   ------ --------------------------------- 1.6/9.9 MB 5.7 MB/s eta 0:00:02
   --------- ------------------------------ 2.2/9.9 MB 7.2 MB/s eta 0:00:02
   ----------- ---------------------------- 2


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [73]:
print(df.columns)
df

Index(['Users', 'New Users', 'Sessions', 'Bounce Rate', 'Pages / Session',
       'Avg. Session Duration', 'Goal Conversion Rate', 'Goal Completions',
       'Goal Value', 'Language_id', 'Language_missing', 'Language_th'],
      dtype='object')


Unnamed: 0,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Goal Conversion Rate,Goal Completions,Goal Value,Language_id,Language_missing,Language_th
0,23497,22696,35113,0.54%,6.35,00:02:32,27.21,9555,$0.00,False,False,False
1,7797,7613,10617,0.65%,4.89,00:01:38,16.26,1726,$0.00,True,False,False
2,3198,3086,4625,0.56%,5.52,00:01:50,21.36,988,$0.00,False,False,False
3,2171,2094,2724,0.29%,4.75,00:01:43,17.18,468,$0.00,True,False,False
4,224,214,295,3.05%,5.03,00:01:44,26.78,79,$0.00,False,False,False
5,204,202,235,1.28%,3.64,00:00:44,4.68,11,$0.00,True,False,False
6,186,186,224,0.45%,3.5,00:00:40,4.91,11,$0.00,False,False,True
7,132,125,231,0.00%,6.92,00:02:10,39.83,92,$0.00,False,False,False
8,126,119,171,0.00%,6.39,00:04:10,26.32,45,$0.00,False,False,False
9,93,91,127,0.00%,5.56,00:01:22,19.69,25,$0.00,False,False,False


In [78]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming df is your DataFrame with the data

# Convert 'Goal Conversion Rate' to numeric, removing the percentage sign
df['Goal Conversion Rate'] = pd.to_numeric(df['Goal Conversion Rate'].str.rstrip('%'), errors='coerce')

# Select predictors and target variable
X = df[['Pages / Session', 'Language']]
y = df['Goal Conversion Rate']

# One-hot encode the 'Language' variable
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['Language'])], remainder='passthrough')
X = pd.DataFrame(ct.fit_transform(X))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the linear regression model with multiple predictors
model_B = LinearRegression()
model_B.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model_B.predict(X_test)

# Calculate R-squared for the model with multiple predictors
r_squared = r2_score(y_test, y_pred)

# Print the R-squared value rounded to 3 decimal points
print(f"R-squared for model_B (with Language): {round(r_squared, 3)}")


R-squared for model_B (with Language): 0.732


In [79]:
# Retrieve the intercept (beta0) from the linear regression model
intercept_value = model_B.intercept_

# Print the intercept value
print(f"The intercept (beta0) is: {intercept_value}")


The intercept (beta0) is: -15.304748930887037


In [82]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming df is your DataFrame with the data

# Convert 'Goal Conversion Rate' to numeric, removing the percentage sign
df['Goal Conversion Rate'] = pd.to_numeric(df['Goal Conversion Rate'].str.rstrip('%'), errors='coerce')

# Select predictor and target variables for model_A
X_A = df[['Pages / Session']]
y_A = df['Goal Conversion Rate']

# Split the data into training and testing sets for model_A
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, random_state=42)

# Create and fit the linear regression model for model_A
model_A = LinearRegression()
model_A.fit(X_train_A, y_train_A)

# Make predictions on the test set for model_A
y_pred_A = model_A.predict(X_test_A)

# Calculate R-squared for model_A
r_squared_A = r2_score(y_test_A, y_pred_A)

# Print the R-squared value for model_A rounded to 3 decimal points
print(f"R-squared for model_A: {round(r_squared_A, 3)}")

# Select predictors and target variables for model_B
X_B = df[['Pages / Session', 'Language']]
y_B = df['Goal Conversion Rate']

# One-hot encode the 'Language' variable for model_B
X_B = pd.get_dummies(X_B, columns=['Language'], drop_first=True)

# Split the data into training and testing sets for model_B
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Create and fit the linear regression model with multiple predictors for model_B
model_B = LinearRegression()
model_B.fit(X_train_B, y_train_B)

# Make predictions on the test set for model_B
y_pred_B = model_B.predict(X_test_B)

# Calculate R-squared for model_B
r_squared_B = r2_score(y_test_B, y_pred_B)

# Print the R-squared value for model_B rounded to 3 decimal points
print(f"R-squared for model_B (with Language): {round(r_squared_B, 3)}")

# Compare the R-squared values
if r_squared_B > r_squared_A:
    print("The multiple R-squared model improved as a result.")
elif r_squared_A > r_squared_B:
    print("The multiple R-squared model did not improve as a result.")
else:
    print("The R-squared values are equal.")


R-squared for model_A: 0.865
R-squared for model_B (with Language): 0.732
The multiple R-squared model did not improve as a result.


In [85]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming df is your DataFrame with the data

# Convert 'Goal Conversion Rate' to numeric, removing the percentage sign
df['Goal Conversion Rate'] = pd.to_numeric(df['Goal Conversion Rate'].str.rstrip('%'), errors='coerce')

# Select predictor and target variables for model_A
X_A = df[['Pages / Session']]
y_A = df['Goal Conversion Rate']

# Split the data into training and testing sets for model_A
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, random_state=42)

# Create and fit the linear regression model for model_A
model_A = LinearRegression()
model_A.fit(X_train_A, y_train_A)

# Make predictions on the test set for model_A
y_pred_A = model_A.predict(X_test_A)

# Calculate R-squared for model_A
r_squared_A = r2_score(y_test_A, y_pred_A)

# Calculate Adjusted R-squared for model_A
n_A = len(y_test_A)  # number of samples for model_A
p_A = 1  # number of predictors for model_A
adjusted_r_squared_A = 1 - (1 - r_squared_A) * ((n_A - 1) / (n_A - p_A - 1))

# Print the R-squared and Adjusted R-squared values for model_A
print(f"R-squared for model_A: {round(r_squared_A, 3)}")
print(f"Adjusted R-squared for model_A: {round(adjusted_r_squared_A, 3)}")

# Select predictors and target variables for model_B
X_B = df[['Pages / Session', 'Language']]
y_B = df['Goal Conversion Rate']

# One-hot encode the 'Language' variable for model_B
X_B = pd.get_dummies(X_B, columns=['Language'], drop_first=True)

# Split the data into training and testing sets for model_B
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, random_state=42)

# Create and fit the linear regression model with multiple predictors for model_B
model_B = LinearRegression()
model_B.fit(X_train_B, y_train_B)

# Make predictions on the test set for model_B
y_pred_B = model_B.predict(X_test_B)

# Calculate R-squared for model_B
r_squared_B = r2_score(y_test_B, y_pred_B)

# Calculate Adjusted R-squared for model_B
n_B, p_B = X_B.shape  # number of samples and predictors for model_B
adjusted_r_squared_B = 1 - (1 - r_squared_B) * ((n_B - 1) / (n_B - p_B - 1))

# Print the R-squared and Adjusted R-squared values for model_B
print(f"R-squared for model_B (with Language): {round(r_squared_B, 3)}")
print(f"Adjusted R-squared for model_B (with Language): {round(adjusted_r_squared_B, 3)}")

# Compare the R-squared and Adjusted R-squared values
if r_squared_B > r_squared_A and adjusted_r_squared_B > adjusted_r_squared_A:
    print("The multiple R-squared and Adjusted R-squared model improved as a result.")
elif r_squared_A > r_squared_B and adjusted_r_squared_A > adjusted_r_squared_B:
    print("The multiple R-squared and Adjusted R-squared model did not improve as a result.")
else:
    print("The R-squared and Adjusted R-squared values are equal.")


R-squared for model_A: 0.865
Adjusted R-squared for model_A: 0.731
R-squared for model_B (with Language): 0.732
Adjusted R-squared for model_B (with Language): 0.554
The multiple R-squared and Adjusted R-squared model did not improve as a result.
