In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
%matplotlib inline

In [2]:
heroinUsers = pd.read_excel('./tables/drugOffenses/heroinUsers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30) # for a clean dataframe
heroinSuppliers = pd.read_excel('./tables/drugOffenses/heroinSuppliers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30)



In [3]:
data = dict()
i = 0
for target_country in heroinUsers['Country']:
    # making the dataframe for target_country
    df = heroinSuppliers[heroinSuppliers['Country'] == target_country].T[1:]
    df.columns = ["Suppliers " + target_country]

    for country in list(heroinSuppliers['Country']):
        if country != target_country:
            joiner = heroinSuppliers[heroinSuppliers['Country'] == country].T[1:]
            joiner.columns = ["Suppliers " + country]
            df = df.join(joiner)

    # users in target country
    joiner = heroinUsers[heroinUsers['Country'] == target_country].T[1:]
    
    if not joiner[i].isnull().sum() >= len(joiner/2):
        i+= 1
        
        joiner = joiner.fillna(joiner.mean())
        
        joiner.columns = ["Users " + target_country]
        df = df.join(joiner)

        # handeling NAN values
        for country in df.columns:
            if df[country].isnull().sum() >= len(df[country]/2):
                #print("in " + country + " ima poveke od half")
                df = df.drop(country, axis = 1) # if more than half are nan
            else:
                df[country] = df[country].fillna(df[country].mean())


        X = df.loc[:, df.columns != "Users " + target_country].values
        y = df["Users " + target_country].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

        regressor = LinearRegression()  
        regressor.fit(X_train, y_train)

        y_pred = regressor.predict(X_test)

        mae=metrics.mean_absolute_error(y_test, y_pred)
        r2=r2_score(y_test, y_pred)

        data[target_country] = (mae, r2)
    else:
        i+=1

In [4]:
data

{'Austria': (196.3259389275675, 0.8935270044559369),
 'Belgium': (228.23731978147754, 0.2231902276747466),
 'Bulgaria': (133.9211306806041, -1.1134381180749728),
 'Croatia': (163.8227927960843, 0.12189425804611842),
 'Cyprus': (16.986553457906798, 0.560700186549018),
 'Czechia': (34.36888629934337, -1.6484447798860198),
 'France': (834.5689498594962, -0.9854383899469432),
 'Germany': (1725.4028309176988, 0.8304381628206003),
 'Greece': (831.7962159696774, 0.24992540772133198),
 'Hungary': (99.97389684063931, -1.8020592504087767),
 'Italy': (830.9297466123619, 0.16927666855480372),
 'Latvia': (45.12931959396919, -0.551907664377824),
 'Lithuania': (101.47663814874056, -0.7281850952047606),
 'Luxembourg': (54.949086853840676, -0.3761484115618403),
 'Malta': (57.664702856293445, -1.4237468836695384),
 'Poland': (200.32555183340574, -0.10921601376193446),
 'Portugal': (109.51898947940799, 0.11597288277003504),
 'Slovakia': (24.468534066663235, -1.1647143426557616),
 'Slovenia': (56.01863490

## simple regression on Austria based on years and suppliers in Austria
## this is cringe, dont mind this (i may return to this)

In [5]:
target_country = 'Austria'
df_simple = heroinSuppliers[heroinSuppliers['Country'] == target_country].T[1:]
df_simple.columns = ["Suppliers " + target_country]

joiner = heroinUsers[heroinUsers['Country'] == target_country].T[1:]
joiner.columns = ["Users " + target_country]
df_simple = df_simple.join(joiner)
df_simple = df_simple.drop(2004)
df_simple['Years'] = [i for i in range(2005, 2018)][::-1]

In [6]:
reg_simple = linear_model.LinearRegression()
reg_simple.fit(df_simple[['Years','Suppliers Austria']], df_simple["Users Austria"])

LinearRegression()

In [7]:
user_list=[]

suppliers = df_simple['Suppliers Austria'].mean()
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print("In the year " + str(year) + " the model has predicted " + 
          str(users) + " users in Austria on " + str(suppliers) + " suppliers")
    
    user_list.append(users)    #on mean suppliers
print("====================")
suppliers = 700
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print("In the year " + str(year) + " the model has predicted " + 
          str(users) + " users in Austria on " + str(suppliers) + " suppliers")

In the year 2018 the model has predicted 1020.3006127455155 users in Austria on 586.5384615384615 suppliers
In the year 2019 the model has predicted 880.9919090718031 users in Austria on 586.5384615384615 suppliers
In the year 2020 the model has predicted 741.6832053980906 users in Austria on 586.5384615384615 suppliers
In the year 2021 the model has predicted 602.3745017243782 users in Austria on 586.5384615384615 suppliers
In the year 2022 the model has predicted 463.06579805066576 users in Austria on 586.5384615384615 suppliers
In the year 2023 the model has predicted 323.7570943769533 users in Austria on 586.5384615384615 suppliers
In the year 2024 the model has predicted 184.44839070324088 users in Austria on 586.5384615384615 suppliers
In the year 2018 the model has predicted 1379.8000689211185 users in Austria on 700 suppliers
In the year 2019 the model has predicted 1240.491365247406 users in Austria on 700 suppliers
In the year 2020 the model has predicted 1101.1826615736936 u

In [8]:
years = [i for i in range (2018,2025)]
plt.figure(figsize=(15,7))
plt.plot(years, user_list)

[<matplotlib.lines.Line2D at 0x7fbb83a2d160>]