<a href="https://colab.research.google.com/github/dweizzz/datathon23/blob/main/LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import numpy as np
import matplotlib as plot
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [76]:
#preprocessing
df_immunizations = pd.read_csv('Immunisations.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_immunizations.drop(to_drop, inplace=True, axis=1)

In [105]:
#creating a separate dataframe with just the doctors  
df_doc = pd.read_csv('Physicians.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_doc.drop(to_drop, inplace=True, axis=1)
df_doc = df_doc[df_doc['Variable'].str.contains('Practising physicians')]
df_doc = pd.merge(left=df_immunizations, right=df_doc, how='right', left_on=['Year','Country'], right_on=['Year','Country'])
#merged = merged.rename(columns={'Value_x': 'percent_immun', 'Value_y': 'num_physicians'})
#merged = merged.drop(['Variable_x', 'Value_y'], axis=1)
df_doc = df_doc.drop(['Variable_x', 'Variable_y'], axis=1)
df_doc = df_doc.rename(columns={'Value_x': 'Immunization', 'Value_y': 'Num_Physicians'})
df_doc

Unnamed: 0,Country,Year,Immunization,Num_Physicians
0,Australia,2011,91.6,74100.0
1,Australia,2011,92.0,74100.0
2,Australia,2011,94.0,74100.0
3,Australia,2012,91.8,75258.0
4,Australia,2012,92.2,75258.0
...,...,...,...,...
2727,Peru,2016,88.0,40352.0
2728,Peru,2016,88.6,40352.0
2729,Peru,2018,84.0,43805.0
2730,Peru,2018,85.0,43805.0


In [109]:
#preprocessing
df_hospitals = pd.read_csv('Hospitals.csv')
to_drop = ['VAR','UNIT','Measure','COU','YEA', 'Flag Codes','Flags']
df_hospitals.reset_index(inplace=True)  # Reset the index to a simple integer index
df_hospitals=df_hospitals[['index','Country','Year','Variable','Value']]
df_hospitals = df_hospitals.groupby(['Country', 'Year'])['Value'].sum().reset_index()
#df_total.sort_values(["Country", "Year", "Variable"])
df_hospitals = df_hospitals.rename(columns={'Value': 'Num_Hospitals'})
df_hospitals

Unnamed: 0,Country,Year,Num_Hospitals
0,Australia,2010,3842.85
1,Australia,2011,3842.65
2,Australia,2012,3838.75
3,Australia,2013,3863.11
4,Australia,2014,3724.15
...,...,...,...
382,United States,2015,15631.58
383,United States,2016,15564.03
384,United States,2017,17045.28
385,United States,2018,16869.46


In [127]:
all_vars = pd.merge(left=df_doc, right=df_hospitals, how='inner', left_on=['Year','Country'], right_on=['Year','Country'])
all_vars

Unnamed: 0,Country,Year,Immunization,Num_Physicians,Num_Hospitals
0,Australia,2011,91.6,74100.00,3842.65
1,Australia,2011,92.0,74100.00,3842.65
2,Australia,2011,94.0,74100.00,3842.65
3,Australia,2011,91.6,3.32,3842.65
4,Australia,2011,92.0,3.32,3842.65
...,...,...,...,...,...
2017,Lithuania,2020,91.4,12529.00,304.15
2018,Lithuania,2020,26.3,4.48,304.15
2019,Lithuania,2020,90.1,4.48,304.15
2020,Lithuania,2020,91.4,4.48,304.15


In [124]:
country_specific = all_vars[all_vars['Country'] == 'Australia']

y = country_specific['Immunization']
X = country_specific[['Year', 'Num_Physicians','Num_Hospitals']]
X['Num_Physicians_Sqrt'] = np.sqrt(X['Num_Physicians'])
#X['Num_Physicians_Sq'] = country_specific['Num_Physicians'] ** 2
X['Num_Hospitals_Sqrt'] = np.sqrt(X['Num_Hospitals'])
#X['Num_Physicians_'] = country_specific['Num_Hospitals'] ** 2

# Add a constant term to the independent variables to represent the intercept
X = sm.add_constant(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the OLS model
model = sm.OLS(y_train, X_train)

# Train the model using the fit method
results = model.fit()

# Predict values for the test set
y_pred = results.predict(X_test)

# Get the model summary
summary = results.summary()

# Print the model summary
print(summary)

                            OLS Regression Results                            
Dep. Variable:           Immunization   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                 -0.073
Method:                 Least Squares   F-statistic:                    0.4123
Date:                Sat, 29 Apr 2023   Prob (F-statistic):              0.837
Time:                        23:02:25   Log-Likelihood:                -152.28
No. Observations:                  44   AIC:                             316.6
Df Residuals:                      38   BIC:                             327.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                1014.5242   2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Num_Physicians_Sqrt'] = np.sqrt(X['Num_Physicians'])


In [125]:
q1 = country_specific.quantile(0.25)
q3 = country_specific.quantile(0.75)
iqr = q3 - q1
outliers = ((country_specific < (q1 - 1.5 * iqr)) | (country_specific > (q3 + 1.5 * iqr))).sum()
print('Outliers in the data:\n', outliers)

Outliers in the data:
 Country           0
Immunization      2
Num_Hospitals     0
Num_Physicians    0
Year              0
dtype: int64


  q1 = country_specific.quantile(0.25)
  q3 = country_specific.quantile(0.75)
  outliers = ((country_specific < (q1 - 1.5 * iqr)) | (country_specific > (q3 + 1.5 * iqr))).sum()
