In [2]:
import numpy as np
import pandas as pd

In [3]:
import pymysql
from sqlalchemy import create_engine

import getpass 

In [4]:
password = getpass.getpass()

········


# 1. Import data

In [14]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/project_week_5'
engine = create_engine(connection_string)
carpark = pd.read_sql_query('SELECT merk, reg_date, CO2_emissions FROM carpark', engine)
carpark

Unnamed: 0,merk,reg_date,CO2_emissions
0,NISSAN,2005-03-17,187.000000
1,OPEL,2006-01-05,187.000000
2,RENAULT,2004-09-30,145.000000
3,RENAULT,2006-07-28,196.000000
4,RENAULT,2007-03-23,157.428571
...,...,...,...
78989,NISSAN,2019-12-12,131.000000
78990,DACIA,2019-11-23,121.000000
78991,DACIA,2020-01-13,121.000000
78992,RENAULT,2019-12-06,99.000000


In [10]:
allregistrations = pd.read_sql_query('SELECT merk, reg_date, final_CO2 FROM allregistrations', engine)
allregistrations

Unnamed: 0,merk,reg_date,final_CO2
0,RENAULT,2021-11-23,106.0
1,RENAULT,2021-11-23,115.0
2,RENAULT,2020-12-18,0.0
3,RENAULT,2021-04-30,123.0
4,RENAULT,2021-04-29,0.0
...,...,...,...
1066346,RENAULT,2012-06-27,129.0
1066347,RENAULT,2004-02-13,192.0
1066348,RENAULT,2020-11-18,99.0
1066349,RENAULT,2001-07-25,211.0


# 2. Quick Data transformation

For the hypothesis testing, I will use the registration year only, instead of the fully detailed registration date.

In [18]:
allregistrations['reg_year'] = allregistrations['reg_date'].apply(lambda x: str(x)[0:4]).astype(int)
carpark['reg_year'] = carpark['reg_date'].apply(lambda x: str(x)[0:4]).astype(int)

In [19]:
allregistrations

Unnamed: 0,merk,reg_date,final_CO2,reg_year
0,RENAULT,2021-11-23,106.0,2021
1,RENAULT,2021-11-23,115.0,2021
2,RENAULT,2020-12-18,0.0,2020
3,RENAULT,2021-04-30,123.0,2021
4,RENAULT,2021-04-29,0.0,2021
...,...,...,...,...
1066346,RENAULT,2012-06-27,129.0,2012
1066347,RENAULT,2004-02-13,192.0,2004
1066348,RENAULT,2020-11-18,99.0,2020
1066349,RENAULT,2001-07-25,211.0,2001


In [20]:
carpark

Unnamed: 0,merk,reg_date,CO2_emissions,reg_year
0,NISSAN,2005-03-17,187.000000,2005
1,OPEL,2006-01-05,187.000000,2006
2,RENAULT,2004-09-30,145.000000,2004
3,RENAULT,2006-07-28,196.000000,2006
4,RENAULT,2007-03-23,157.428571,2007
...,...,...,...,...
78989,NISSAN,2019-12-12,131.000000,2019
78990,DACIA,2019-11-23,121.000000,2019
78991,DACIA,2020-01-13,121.000000,2020
78992,RENAULT,2019-12-06,99.000000,2019


# 3. Hypothesis Testing

My hypothesis is that our carpark is cleaner than the full automotive market. Which would mean that the average CO2 emissions of our car park for a given year should be lower than the average emission of all registrations.

H1 is thus : average emissions of carpark < average emissions of allregistratrions

H0 becomes : average emissions of carpark >= average emissions of allregistratrions

To test this hypothesis, I will first calculate the average emissions of allregistrations for the chosen year. This will become the reference for the testing of the hypothesis on carpark.

In [21]:
average_allregistrations = allregistrations.groupby('reg_year').agg({'final_CO2':np.mean})
average_allregistrations

Unnamed: 0_level_0,final_CO2
reg_year,Unnamed: 1_level_1
2000,169.00979
2001,168.057182
2002,167.475038
2003,171.036581
2004,174.972009
2005,174.546009
2006,171.028892
2007,172.724291
2008,165.354318
2009,158.790348


In [22]:
test_year = 2021
carpark_sample = carpark[carpark['reg_year'] == test_year]
carpark_sample.dtypes

merk                     object
reg_date         datetime64[ns]
CO2_emissions           float64
reg_year                  int32
dtype: object

The CO2_emissions are a float, we can proceed.

In [23]:
from scipy.stats import ttest_1samp
stat, pval = ttest_1samp(carpark_sample['CO2_emissions'], average_allregistrations['final_CO2'][test_year])
print('stat is  ', stat)         
print('pvalue for this one-tailed test is ', pval/2)

stat is   9.083520289676315
pvalue for this one-tailed test is  6.557751045336606e-20


The very low p value indicates that the average emissions of our carpark are significantly different than the value for all registrations. Stat is positive, which means that our average is probably higher than the one for the "full market".

We can now easily test the hypothesis for different years.

In [24]:
for test_year_loop in range(2017, 2021):
    carpark_sample_loop = carpark[carpark['reg_year'] == test_year_loop]
    stat_loop, pval_loop = ttest_1samp(carpark_sample_loop['CO2_emissions'], average_allregistrations['final_CO2'][test_year_loop])
    print('for ', test_year_loop, ' stat is  ', stat_loop, ' and pvalue is ', pval_loop/2)
#     print('\n')

for  2017  stat is   10.592632108077593  and pvalue is  2.107951154210001e-26
for  2018  stat is   12.895591420526927  and pvalue is  3.8661183811904194e-38
for  2019  stat is   6.745211518585458  and pvalue is  7.999658731179001e-12
for  2020  stat is   5.157387059324169  and pvalue is  1.2791430442262726e-07


We see that for all the tested years, the emissions of our car park are signficantly different than the overall registrations average and that our car park is always higher.

# 4. Second hypothesis testing on brand level

In [25]:
for brand in ['RENAULT', 'DACIA', 'NISSAN']:
    print(brand)
    average_brand = allregistrations[allregistrations['merk']==brand].groupby('reg_year').agg({'final_CO2':np.mean})
    
    for test_year_loop in range(2017, 2022):
        carpark_sample_loop = carpark[(carpark['reg_year'] == test_year_loop) & (carpark['merk'] == brand)]
        stat_loop, pval_loop = ttest_1samp(carpark_sample_loop['CO2_emissions'], average_brand['final_CO2'][test_year_loop])
        print('for ', test_year_loop, ' stat is  ', stat_loop, ' and pvalue is ', pval_loop/2)
    
    print('\n')

RENAULT
for  2017  stat is   14.977584561292812  and pvalue is  2.524695799615929e-50
for  2018  stat is   11.744536439850608  and pvalue is  6.184437145380292e-32
for  2019  stat is   2.034875919163297  and pvalue is  0.020949111953840683
for  2020  stat is   4.279202352627834  and pvalue is  9.53682631989386e-06
for  2021  stat is   10.703534600311379  and pvalue is  9.0210625676825e-27


DACIA
for  2017  stat is   -6.326118302579882  and pvalue is  1.6901304381378372e-10
for  2018  stat is   -4.542362398627365  and pvalue is  2.996094822751926e-06
for  2019  stat is   -6.456563180568196  and pvalue is  8.748029957608915e-11
for  2020  stat is   -4.0766581492564  and pvalue is  2.51986283481157e-05
for  2021  stat is   -6.343581737941242  and pvalue is  1.6241932524330893e-10


NISSAN
for  2017  stat is   -0.34612488917082457  and pvalue is  0.36464102344175664
for  2018  stat is   4.87295647204134  and pvalue is  5.824686568191436e-07
for  2019  stat is   9.169958347314163  and pval

Based on the p values, we see that there is only one brand/year combination for which our average emissions are in line with the full registrations : it is Nissan in 2017.

For the rest, the emissions of our car park are always above the overall average for Renault and Nissan.

For Dacia, however, we see that our emissions are always below the overall average.