In [49]:
# imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# sklearn imports
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

sns.set()
%matplotlib inline

In [7]:
# Read the cleaned dataset from disk and preview its first rows.
csv_path = 'data/cleaned.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,country,total_vaccinations_x,amount_of_days,Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share,region,sub-region,vaccine,total_vaccinations_y,days_to_make_vaccines,amount_vaccines_needed
0,Afghanistan,422200.0,59,39074280,2.33 %,886592,60,652860,-62920.0,4.6,18,25 %,0.50 %,Asia,Southern Asia,0,0.0,0.0,0.0
1,Albania,11295390.0,59,2877239,-0.11 %,-3120,105,27400,-14000.0,1.6,36,63 %,0.04 %,Europe,Southern Europe,0,0.0,0.0,0.0
2,Algeria,75030.0,59,43984569,1.85 %,797990,18,2381740,-10000.0,3.1,29,73 %,0.56 %,Africa,Northern Africa,0,0.0,0.0,0.0
3,Andorra,185420.0,59,77287,0.16 %,123,164,470,0.0,N.A.,N.A.,88 %,0.00 %,Europe,Southern Europe,0,0.0,0.0,0.0
4,Angola,1551757.0,59,33032075,3.27 %,1040977,26,1246700,6413.0,5.6,17,67 %,0.42 %,Africa,Sub-Saharan Africa,0,0.0,0.0,0.0


In [3]:
# Columns carried forward into feature engineering and modelling.
columns = ['country', 'Density (P/Km²)', 'Land Area (Km²)', 'amount_of_days', 'Population (2020)', 'World Share',
           'region', 'sub-region', 'total_vaccinations_y', 'total_vaccinations_x', 'Urban Pop %']

# Take an explicit copy: later cells assign new column values on tmp_df, and
# assigning on a bare selection of `df` triggers pandas' SettingWithCopyWarning.
tmp_df = df[columns].copy()
tmp_df.head()

Unnamed: 0,country,Density (P/Km²),Land Area (Km²),amount_of_days,Population (2020),World Share,region,sub-region,total_vaccinations_y,total_vaccinations_x,Urban Pop %
0,Afghanistan,60,652860,59,39074280,0.50 %,Asia,Southern Asia,0.0,422200.0,25 %
1,Albania,105,27400,59,2877239,0.04 %,Europe,Southern Europe,0.0,11295390.0,63 %
2,Algeria,18,2381740,59,43984569,0.56 %,Africa,Northern Africa,0.0,75030.0,73 %
3,Andorra,164,470,59,77287,0.00 %,Europe,Southern Europe,0.0,185420.0,88 %
4,Angola,26,1246700,59,33032075,0.42 %,Africa,Sub-Saharan Africa,0.0,1551757.0,67 %


In [4]:
def convert_float(value):
    """Parse a percentage string such as ``'0.50 %'`` into a float.

    The whole numeric part is parsed (not a fixed-width prefix), so values
    like ``'18.47 %'`` or ``'-0.11 %'`` keep all of their digits.

    Parameters
    ----------
    value : str or number
        Percentage text with an optional trailing ``%``. Values that are not
        strings are passed straight to ``float`` so that re-running the cell
        on an already converted column does not fail.

    Returns
    -------
    float
        The numeric value of the percentage (``'0.50 %'`` -> ``0.5``).

    Raises
    ------
    ValueError
        If the text left after removing ``%`` is not a valid number.
    """
    if isinstance(value, str):
        # Drop the percent sign and surrounding whitespace before parsing.
        return float(value.replace('%', '').strip())
    return float(value)
    
# Replace the textual 'World Share' column with its numeric conversion.
world_share_numeric = tmp_df['World Share'].apply(convert_float)
tmp_df['World Share'] = world_share_numeric
tmp_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df['World Share'] = tmp_df['World Share'].apply(convert_float)


Unnamed: 0,country,Density (P/Km²),Land Area (Km²),amount_of_days,Population (2020),World Share,region,sub-region,total_vaccinations_y,total_vaccinations_x,Urban Pop %
0,Afghanistan,60,652860,59,39074280,0.5,Asia,Southern Asia,0.0,422200.0,25 %
1,Albania,105,27400,59,2877239,0.04,Europe,Southern Europe,0.0,11295390.0,63 %
2,Algeria,18,2381740,59,43984569,0.56,Africa,Northern Africa,0.0,75030.0,73 %
3,Andorra,164,470,59,77287,0.0,Europe,Southern Europe,0.0,185420.0,88 %
4,Angola,26,1246700,59,33032075,0.42,Africa,Sub-Saharan Africa,0.0,1551757.0,67 %


In [9]:
def convert_float_space(value):
    """Parse a percentage string such as ``'25 %'`` into a float.

    Parameters
    ----------
    value : str or number
        Percentage text whose first whitespace-separated token is the number,
        or the placeholder ``'N.A.'``. Values that are not strings are passed
        straight to ``float`` so that re-running the cell on an already
        converted column does not fail.

    Returns
    -------
    float
        The parsed number, or ``0.0`` when the value is ``'N.A.'``.
        NOTE(review): mapping missing values to 0 makes them look like real
        0 % observations downstream; confirm this is intended.

    Raises
    ------
    ValueError
        If the first token is not a valid number.
    """
    if not isinstance(value, str):
        return float(value)

    text = value.strip()
    if text == 'N.A.':
        # Always return a float so both branches produce the same type.
        return 0.0

    # Keep only the first token and tolerate a '%' glued to the number ('25%').
    first_token = text.partition(' ')[0]
    return float(first_token.rstrip('%'))
    
# Replace the textual 'Urban Pop %' column with its numeric conversion.
urban_pop_numeric = tmp_df['Urban Pop %'].apply(convert_float_space)
tmp_df['Urban Pop %'] = urban_pop_numeric
tmp_df.head()

Unnamed: 0,country,Density (P/Km²),Land Area (Km²),amount_of_days,Population (2020),World Share,region,sub-region,total_vaccinations_y,total_vaccinations_x,Urban Pop %
0,Afghanistan,60,652860,59,39074280,0.5,Asia,Southern Asia,0.0,422200.0,25.0
1,Albania,105,27400,59,2877239,0.04,Europe,Southern Europe,0.0,11295390.0,63.0
2,Algeria,18,2381740,59,43984569,0.56,Africa,Northern Africa,0.0,75030.0,73.0
3,Andorra,164,470,59,77287,0.0,Europe,Southern Europe,0.0,185420.0,88.0
4,Angola,26,1246700,59,33032075,0.42,Africa,Sub-Saharan Africa,0.0,1551757.0,67.0


In [18]:
# Drop rows whose urban-population share is 'N.A.' in the raw data.
# The mask is built from the raw `df` column: by this point tmp_df['Urban Pop %']
# has already been converted to numbers (with 'N.A.' mapped to 0), so comparing
# the converted column against 'N.A.' would keep every row.
cond = df.loc[tmp_df.index, 'Urban Pop %'] != 'N.A.'
# .copy() so that column assignments in later cells act on an independent frame.
tmp_df = tmp_df[cond].copy()
len(tmp_df)

194

In [19]:
# Integer-encode every object-typed column of tmp_df.
# The single encoder is refitted for each column, so after the loop `le`
# only holds the classes of the last column it encoded.
le = LabelEncoder()

object_columns = [name for name in tmp_df.columns.values if tmp_df[name].dtype == 'object']
for col in object_columns:
    tmp_df[col] = le.fit_transform(tmp_df[col])

tmp_df.head()

Unnamed: 0,country,Density (P/Km²),Land Area (Km²),amount_of_days,Population (2020),World Share,region,sub-region,total_vaccinations_y,total_vaccinations_x,Urban Pop %
0,0,60,652860,59,39074280,0.5,2,12,0.0,422200.0,25.0
1,1,105,27400,59,2877239,0.04,3,13,0.0,11295390.0,63.0
2,2,18,2381740,59,43984569,0.56,0,7,0.0,75030.0,73.0
3,3,164,470,59,77287,0.0,3,13,0.0,185420.0,88.0
4,4,26,1246700,59,33032075,0.42,0,14,0.0,1551757.0,67.0


In [125]:
# Feature matrix and regression target.
feature_columns = ['Density (P/Km²)', 'Population (2020)', 'total_vaccinations_y', 'Urban Pop %',
                   'sub-region']
X = tmp_df[feature_columns]
y = tmp_df['total_vaccinations_x']

# Rescale every feature with a MinMaxScaler fitted on X.
scaler = MinMaxScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

In [126]:
# Split the raw features first, then fit the scaler on the training rows only,
# so the held-out rows do not influence the scaling parameters (no leakage).
# The split itself is unchanged: same test size and the same random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# Reuse the training min/max for the test rows; never refit on test data.
X_test = scaler.transform(X_test)

In [127]:
# Fit a linear regression on the training split and report the score
# on the training and test splits.
model = LinearRegression().fit(X_train, y_train)
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print(r2_train, r2_test)

0.7818535085152625 0.3231386147880635


In [128]:
# NOTE(review): this cell refits the same model on the same split as the
# preceding cell and prints the same two scores; it looks removable.
model = LinearRegression()
model.fit(X_train, y_train)
r2_train, r2_test = model.score(X_train, y_train), model.score(X_test, y_test)
print(r2_train, r2_test)

0.7818535085152625 0.3231386147880635


In [129]:
# Put the held-out targets next to the model's predictions for inspection.
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the data
# read from 'data/cleaned.csv'; earlier cells re-run after this one will see
# this two-column frame instead. Consider a dedicated name.
p = model.predict(X_test)
comparison = {'Actual': y_test, 'Predicted': p}
df = pd.DataFrame(comparison)
df

Unnamed: 0,Actual,Predicted
138,4231.0,-226704700.0
16,3634823.0,-153472800.0
155,14142100.0,33670370.0
96,12639570.0,71433480.0
68,1923281.0,-46878840.0
153,582325.0,239887000.0
55,35546.0,181104200.0
15,255177500.0,804277700.0
112,17086240.0,-106462600.0
111,45041180.0,293507800.0
