<a href="https://colab.research.google.com/github/dweizzz/datathon23/blob/main/AryaLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import matplotlib as plot
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from functools import reduce

#Dataframes

In [5]:
#preprocessing
df_immunizations = pd.read_csv('Immunisations.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_immunizations.drop(to_drop, inplace=True, axis=1)

In [52]:
#physicians dataframe
df_doc = pd.read_csv('Physicians.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_doc.drop(to_drop, inplace=True, axis=1)
df_doc = df_doc[df_doc['Variable'].str.contains('Practising physicians')]
df_doc = pd.merge(left=df_immunizations, right=df_doc, how='right', left_on=['Year','Country'], right_on=['Year','Country'])
#merged = merged.rename(columns={'Value_x': 'percent_immun', 'Value_y': 'num_physicians'})
#merged = merged.drop(['Variable_x', 'Value_y'], axis=1)
df_doc = df_doc.drop(['Variable_x', 'Variable_y'], axis=1)
df_doc = df_doc.rename(columns={'Value_x': 'Immunization', 'Value_y': 'Number Physicians'})
df_doc

Unnamed: 0,Country,Year,Immunization,Number Physicians
0,Australia,2011,91.6,74100.0
1,Australia,2011,92.0,74100.0
2,Australia,2011,94.0,74100.0
3,Australia,2012,91.8,75258.0
4,Australia,2012,92.2,75258.0
...,...,...,...,...
2727,Peru,2016,88.0,40352.0
2728,Peru,2016,88.6,40352.0
2729,Peru,2018,84.0,43805.0
2730,Peru,2018,85.0,43805.0


In [51]:
#hospitals dataframe
df_hospitals = pd.read_csv('Hospitals.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_hospitals.reset_index(inplace=True)  # Reset the index to a simple integer index
df_hospitals=df_hospitals[['index','Country','Year','Variable','Value']]
df_hospitals = df_hospitals.groupby(['Country', 'Year'])['Value'].sum().reset_index()
df_hospitals = df_hospitals.rename(columns={'Value': 'Number Hospitals'})
df_hospitals

Unnamed: 0,Country,Year,Number Hospitals
0,Australia,2010,3842.85
1,Australia,2011,3842.65
2,Australia,2012,3838.75
3,Australia,2013,3863.11
4,Australia,2014,3724.15
...,...,...,...
382,United States,2015,15631.58
383,United States,2016,15564.03
384,United States,2017,17045.28
385,United States,2018,16869.46


In [17]:
df_medwage = pd.read_csv('MedianWage.csv')
df_medwage=df_medwage[['Country','Time','Unit Code','Value']]
df_medwage = df_medwage.rename(columns={'Value': 'Medium Wage','Time':'Year'})
df_medwage

Unnamed: 0,Country,Year,Unit Code,Medium Wage
0,Australia,2000,AUD,46246.868731
1,Australia,2001,AUD,48315.982391
2,Australia,2002,AUD,50052.758102
3,Australia,2003,AUD,51798.586644
4,Australia,2004,AUD,54199.402711
...,...,...,...,...
2306,Türkiye,2016,USD,30653.260715
2307,Türkiye,2017,USD,30076.378939
2308,Türkiye,2018,USD,30610.715156
2309,Türkiye,2019,USD,32625.050808


In [54]:
df_PrevDeaths = pd.read_csv('PreventableDeaths.csv')
df_PrevDeaths = df_PrevDeaths[df_PrevDeaths['Measure'] == 'Deaths per 100 000 population (standardised rates)']
df_PrevDeaths = df_PrevDeaths[df_PrevDeaths['Variable'] == 'Preventable mortality']
df_PrevDeaths=df_PrevDeaths[['Country','Year','Value']]
df_PrevDeaths = df_PrevDeaths.rename(columns={'Value': 'Number Preven Deaths'})
df_PrevDeaths

Unnamed: 0,Country,Year,Number Preven Deaths
356,Brazil,2010,225
357,Brazil,2011,224
358,Brazil,2012,221
359,Brazil,2013,219
360,Brazil,2014,214
...,...,...,...
7870,Romania,2015,260
7871,Romania,2016,257
7872,Romania,2017,254
7873,Romania,2018,254


#Combining Variable Dataframes

In [55]:
dfs = [df_doc, df_hospitals, df_medwage,df_PrevDeaths]

#merge all DataFrames into one
all_vars = reduce(lambda left, right: pd.merge(left, right, on=['Country', 'Year'], how='inner'), dfs)
all_vars

Unnamed: 0,Country,Year,Immunization,Number Physicians,Number Hospitals,Unit Code,Medium Wage,Number Preven Deaths
0,Australia,2011,91.6,74100.00,3842.65,AUD,72978.599187,109
1,Australia,2011,91.6,74100.00,3842.65,AUD,85380.702985,109
2,Australia,2011,91.6,74100.00,3842.65,USD,54341.632161,109
3,Australia,2011,92.0,74100.00,3842.65,AUD,72978.599187,109
4,Australia,2011,92.0,74100.00,3842.65,AUD,85380.702985,109
...,...,...,...,...,...,...,...,...
5575,Lithuania,2020,91.4,4.48,304.15,EUR,20587.013196,285
5576,Lithuania,2020,91.4,4.48,304.15,USD,39610.596161,285
5577,Lithuania,2020,91.4,4.48,304.15,EUR,19688.950141,285
5578,Lithuania,2020,91.4,4.48,304.15,EUR,20587.013196,285


#Linear Regression Model

In [62]:
country_specific = all_vars[all_vars['Country'] == 'Lithuania']

y = country_specific['Immunization']
X = country_specific[['Year', 'Number Physicians','Number Hospitals','Medium Wage','Number Preven Deaths']]
X['Number Physicians Sqrt'] = np.sqrt(X['Number Physicians'])
#X['Num_Physicians_Sq'] = country_specific['Num_Physicians'] ** 2
X['Number Hospitals Sqrt'] = np.sqrt(X['Number Hospitals'])
#X['Number Preven Deaths Sq'] = country_specific['Number Preven Deaths'] ** 2
X['Number Preven Deaths Sqrt'] = np.sqrt(X['Number Preven Deaths'])

# Add a constant term to the independent variables to represent the intercept
X = sm.add_constant(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the OLS model
model = sm.OLS(y_train, X_train)

# Train the model using the fit method
results = model.fit()

# Predict values for the test set
y_pred = results.predict(X_test)

# Get the model summary
summary = results.summary()

# Print the model summary
print(summary)

                            OLS Regression Results                            
Dep. Variable:           Immunization   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                 -0.026
Method:                 Least Squares   F-statistic:                    0.3252
Date:                Sun, 30 Apr 2023   Prob (F-statistic):              0.956
Time:                        14:14:49   Log-Likelihood:                -1053.6
No. Observations:                 211   AIC:                             2125.
Df Residuals:                     202   BIC:                             2155.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Number Physicians Sqrt'] = np.sqrt(X['Number Physicians'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Number Hospitals Sqrt'] = np.sqrt(X['Number Hospitals'])


In [None]:
q1 = country_specific.quantile(0.25)
q3 = country_specific.quantile(0.75)
iqr = q3 - q1
outliers = ((country_specific < (q1 - 1.5 * iqr)) | (country_specific > (q3 + 1.5 * iqr))).sum()
print('Outliers in the data:\n', outliers)

Outliers in the data:
 Country           0
Immunization      2
Num_Hospitals     0
Num_Physicians    0
Year              0
dtype: int64


  q1 = country_specific.quantile(0.25)
  q3 = country_specific.quantile(0.75)
  outliers = ((country_specific < (q1 - 1.5 * iqr)) | (country_specific > (q3 + 1.5 * iqr))).sum()
