In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
transit = pd.read_csv("Data/transit_income.csv")
transit.drop('Unnamed: 0', axis=1, inplace=True)
transit.head()

Unnamed: 0,country,city,phase,start_year,end_year,rr,length,tunnelper,tunnel,elevated,...,anglo,real_cost_2021,cost_km_2021,country_name,Region,IncomeGroup,2021,rule_of_law,no_corruption,enforcement
0,TR,Istanbul,M4 Phase 3,2015,2022,0.0,7.5,1.0,7.5,0.0,...,0.0,550.341886,73.378918,Turkiye,Europe & Central Asia,Upper middle income,9661.227734,0.42,0.46,0.41
1,CA,Vancouver,Broadway,2020,2025,0.0,5.7,0.877193,5.0,0.295,...,1.0,2565.24065,450.042219,Canada,North America,High income,52358.62164,0.8,0.82,0.8
2,CA,Toronto,Vaughan,2009,2017,0.0,8.6,1.0,8.6,0.0,...,1.0,3014.952287,350.575847,Canada,North America,High income,52358.62164,0.8,0.82,0.8
3,CA,Toronto,Scarborough,2020,2030,0.0,7.8,1.0,7.8,0.0,...,1.0,4985.450027,639.16026,Canada,North America,High income,52358.62164,0.8,0.82,0.8
4,CA,Toronto,Ontario,2020,2030,0.0,15.5,0.57,8.8,5.76,...,1.0,7632.655004,492.429355,Canada,North America,High income,52358.62164,0.8,0.82,0.8


In [2]:
# Check for missing data
print("\nMissing values:\n")
print(transit.isnull().sum())


Missing values:

country                     0
city                        0
phase                       0
start_year                  0
end_year                    0
rr                          0
length                      0
tunnelper                   0
tunnel                      0
elevated                  352
atgrade                   354
stations                    0
platform_length_meters    429
anglo                       0
real_cost_2021              0
cost_km_2021                0
country_name                0
Region                      0
IncomeGroup                 0
2021                        0
rule_of_law                34
no_corruption              34
enforcement                34
dtype: int64


The following variables might need a reduction
'elevated', 'atgrade', 'platform_length_meters', 'rule_of_law', 'no_corruption', 'enforcement'
'real_cost_2021', 'anglo'

Let us look at each variable.
We can only cluster for indegers. 

In [3]:
# regression preprocessing

from sklearn.preprocessing import LabelEncoder

df = transit
cols_cate = ['country', 'city', 'Region', 'IncomeGroup']
le = LabelEncoder()
for col in cols_cate:
    df[col] = le.fit_transform(df[col])


In [4]:
df.columns

Index(['country', 'city', 'phase', 'start_year', 'end_year', 'rr', 'length',
       'tunnelper', 'tunnel', 'elevated', 'atgrade', 'stations',
       'platform_length_meters', 'anglo', 'real_cost_2021', 'cost_km_2021',
       'country_name', 'Region', 'IncomeGroup', '2021', 'rule_of_law',
       'no_corruption', 'enforcement'],
      dtype='object')

In [5]:
df[['start_year', 'end_year', 'rr', 'length',
       'tunnelper', 'tunnel', 'elevated', 'atgrade', 'stations',
       'platform_length_meters', 'anglo', '2021', 'rule_of_law',
       'no_corruption', 'enforcement']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 835 entries, 0 to 834
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   start_year              835 non-null    int64  
 1   end_year                835 non-null    int64  
 2   rr                      835 non-null    float64
 3   length                  835 non-null    float64
 4   tunnelper               835 non-null    float64
 5   tunnel                  835 non-null    float64
 6   elevated                483 non-null    float64
 7   atgrade                 481 non-null    float64
 8   stations                835 non-null    float64
 9   platform_length_meters  406 non-null    object 
 10  anglo                   835 non-null    float64
 11  2021                    835 non-null    float64
 12  rule_of_law             801 non-null    float64
 13  no_corruption           801 non-null    float64
 14  enforcement             801 non-null    fl

In [6]:
import statsmodels.api as sm

# Define your independent variable (X) and dependent variable (y)
df = transit
df = df.dropna()

X = df[['start_year', 'end_year', 'rr', 'length',
       'tunnelper', 'tunnel', 'elevated', 'atgrade', 'stations', 'anglo', '2021', 'rule_of_law',
       'no_corruption', 'enforcement']]
y = df['cost_km_2021']

# Add a constant to the independent value
X = sm.add_constant(X)

# Conduct the linear regression
model = sm.OLS(y, X)
results = model.fit()

# Print out the statistics
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:           cost_km_2021   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.548
Method:                 Least Squares   F-statistic:                     9.494
Date:                Mon, 17 Jul 2023   Prob (F-statistic):           3.53e-12
Time:                        14:44:16   Log-Likelihood:                -680.10
No. Observations:                  99   AIC:                             1390.
Df Residuals:                      84   BIC:                             1429.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          -361.4298   6147.878     -0.059

In [7]:
transit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 835 entries, 0 to 834
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 835 non-null    int64  
 1   city                    835 non-null    int64  
 2   phase                   835 non-null    object 
 3   start_year              835 non-null    int64  
 4   end_year                835 non-null    int64  
 5   rr                      835 non-null    float64
 6   length                  835 non-null    float64
 7   tunnelper               835 non-null    float64
 8   tunnel                  835 non-null    float64
 9   elevated                483 non-null    float64
 10  atgrade                 481 non-null    float64
 11  stations                835 non-null    float64
 12  platform_length_meters  406 non-null    object 
 13  anglo                   835 non-null    float64
 14  real_cost_2021          835 non-null    fl

In [9]:
# Import the necessary packages
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder


# One-hot encode the categorical variables
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = encoder.fit_transform(transit[['country', 'city', 'Region', 'IncomeGroup']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['country', 'city', 'Region', 'IncomeGroup']))

# Merge the encoded features with the rest of the DataFrame
transit_encoded = pd.concat([transit.drop(['country', 'city', 'Region', 'IncomeGroup'], axis=1), encoded_df], axis=1)
transit_encoded.fillna(transit_encoded.mean(), inplace=True)
# Define predictor variables and target variable
X = transit_encoded.drop(['cost_km_2021', 'phase', 'platform_length_meters','country_name' ], axis=1)
y = transit_encoded['cost_km_2021']

# Create train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Apply Ridge regression model
ridge = Ridge(alpha=1.0)

# Fit the model
ridge.fit(X_train, y_train)

# Make predictions using the test set
y_pred = ridge.predict(X_test)

# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)

mse


  transit_encoded.fillna(transit_encoded.mean(), inplace=True)


49675.521807620215