In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
%matplotlib inline

In [2]:
heroinUsers = pd.read_excel('./tables/drugOffenses/heroinUsers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30) # for a clean dataframe
heroinSuppliers = pd.read_excel('./tables/drugOffenses/heroinSuppliers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30)
cannabisUsers = pd.read_excel('./tables/drugOffenses/cannabisUsers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30) # for a clean dataframe
cannabisSuppliers = pd.read_excel('./tables/drugOffenses/cannabisSuppliers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30)
cocaineUsers = pd.read_excel('./tables/drugOffenses/cocaineUsers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30) # for a clean dataframe
cocaineSuppliers = pd.read_excel('./tables/drugOffenses/cocaineSuppliers.xlsx', 
                            usecols=[i for i in range(15)], nrows=30)

In [3]:
def get_mae_r2(suppliers, users):
    data = dict()
    i = 0
    for target_country in users['Country']:
        # making the dataframe for target_country
        df = suppliers[suppliers['Country'] == target_country].T[1:]
        df.columns = ["Suppliers " + target_country]

        for country in list(suppliers['Country']):
            if country != target_country:
                joiner = suppliers[suppliers['Country'] == country].T[1:]
                joiner.columns = ["Suppliers " + country]
                df = df.join(joiner)

        # users in target country
        joiner = users[users['Country'] == target_country].T[1:]

        if not joiner[i].isnull().sum() >= len(joiner/2):
            i+= 1
            joiner = joiner.fillna(joiner.mean())

            joiner.columns = ["Users " + target_country]
            df = df.join(joiner)

            # handeling NAN values
            for country in df.columns:
                if df[country].isnull().sum() >= len(df[country]/2):
                    df = df.drop(country, axis = 1) # if more than half are nan
                else:
                    df[country] = df[country].fillna(df[country].mean())

            X = df.loc[:, df.columns != "Users " + target_country].values
            y = df["Users " + target_country].values

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

            # Danilo, samo smeni tuka koj regressor sakas. Dr mislam deka e vo red
            regressor = LinearRegression()
            #regressor = Lasso()
            #regressor = Ridge()
            #regressor = RandomForestRegressor()
            
            regressor.fit(X_train, y_train)

            y_pred = regressor.predict(X_test)

            mae=metrics.mean_absolute_error(y_test, y_pred)
            r2=r2_score(y_test, y_pred)

            data[target_country] = (mae, r2)
        else:
            i+=1
    return data

In [4]:
heroin = get_mae_r2(heroinSuppliers, heroinUsers)
cocaine = get_mae_r2(cocaineSuppliers, cocaineUsers)
cannabis = get_mae_r2(cannabisSuppliers, cannabisUsers)

### Multiple linear regression on heroin, cannabis, cocaine users based on its suppliers. (getting mean absolute error and explained variance)

In [5]:
heroin

{'Austria': (196.3259389275675, 0.8935270044559369),
 'Belgium': (228.23731978147754, 0.2231902276747466),
 'Bulgaria': (133.9211306806041, -1.1134381180749728),
 'Croatia': (163.8227927960843, 0.12189425804611842),
 'Cyprus': (16.986553457906798, 0.560700186549018),
 'Czechia': (34.36888629934337, -1.6484447798860198),
 'France': (834.5689498594962, -0.9854383899469432),
 'Germany': (1725.4028309176988, 0.8304381628206003),
 'Greece': (831.7962159696774, 0.24992540772133198),
 'Hungary': (99.97389684063931, -1.8020592504087767),
 'Italy': (830.9297466123619, 0.16927666855480372),
 'Latvia': (45.12931959396919, -0.551907664377824),
 'Lithuania': (101.47663814874056, -0.7281850952047606),
 'Luxembourg': (54.949086853840676, -0.3761484115618403),
 'Malta': (57.664702856293445, -1.4237468836695384),
 'Poland': (200.32555183340574, -0.10921601376193446),
 'Portugal': (109.51898947940799, 0.11597288277003504),
 'Slovakia': (24.468534066663235, -1.1647143426557616),
 'Slovenia': (56.01863490

In [6]:
cocaine

{'Austria': (220.09071210123147, 0.678149253557105),
 'Belgium': (233.9591303038328, -1.6830136270521483),
 'Bulgaria': (18.671594769646322, -8.725472721540632),
 'Croatia': (178.93093000556718, -0.021950431500328493),
 'Cyprus': (9.328560506617103, -1.5441626305954368),
 'Czechia': (11.278923680409054, -1.7149688158057144),
 'France': (231.09614372511197, -0.30434399441826),
 'Germany': (992.2779358374453, 0.4687564416706381),
 'Greece': (127.42196372004939, 0.3879794672941851),
 'Hungary': (43.6439519783514, -8.937776076972439),
 'Italy': (748.2734590266404, 0.4195649596408747),
 'Latvia': (10.295309946224446, -3.0014270630986593),
 'Lithuania': (11.265079644973687, -11.790403032929408),
 'Luxembourg': (9.952031185208796, -11.26621564705685),
 'Malta': (33.079681614078176, -0.925529661795957),
 'Poland': (23.475637056919094, 0.21488368923146062),
 'Portugal': (90.03892499516506, 0.031037946852024323),
 'Slovakia': (2.247805127907527, -15.8210072982287),
 'Slovenia': (8.95558062485036

In [7]:
cannabis

{'Austria': (3896.8059572607353, -3.129586959888192),
 'Belgium': (938.9369467414297, 0.9330439906588073),
 'Bulgaria': (289.5197950925576, 0.6876840825739916),
 'Croatia': (351.496318036801, -0.45590145769088397),
 'Cyprus': (157.56373346276533, -0.19106341129150062),
 'Czechia': (318.74230090131385, 0.9144200610634611),
 'France': (15853.322649268866, -1.1607777663358063),
 'Germany': (17273.907555949212, -1.8369010177741059),
 'Greece': (1824.7855923968066, -0.8825341108955829),
 'Hungary': (459.2745355994759, -0.470765106412417),
 'Italy': (3088.504233764278, -0.07495523614074129),
 'Latvia': (60.49546520567834, -24.65678499158004),
 'Lithuania': (57.388973642712855, 0.49076272483995165),
 'Luxembourg': (125.30097689565466, -2.6218762070854633),
 'Malta': (23.38863748940599, 0.7212587109617246),
 'Netherlands': (265.7334595734421, -20.626726258834225),
 'Poland': (3852.796038760357, -1.1573122997386576),
 'Portugal': (314.1303572025008, 0.9586937496864232),
 'Slovakia': (25.6937636

## Dataframe

In [8]:
def get_df_reg(suppliers, users):
    regressions = pd.DataFrame()
    i = 0
    for target_country in users['Country']:
        # making the dataframe for target_country
        df = suppliers[suppliers['Country'] == target_country].T[1:]
        df.columns = ["Suppliers " + target_country]

        for country in list(suppliers['Country']):
            if country != target_country:
                joiner = suppliers[suppliers['Country'] == country].T[1:]
                joiner.columns = ["Suppliers " + country]
                df = df.join(joiner)

        # users in target country
        joiner = users[users['Country'] == target_country].T[1:]

        if not joiner[i].isnull().sum() >= len(joiner/2):
            i+= 1
            joiner = joiner.fillna(joiner.mean())

            joiner.columns = ["Users " + target_country]
            df = df.join(joiner)

            # handeling NAN values
            for country in df.columns:
                if df[country].isnull().sum() >= len(df[country]/2):
                    df = df.drop(country, axis = 1) # if more than half are nan
                else:
                    df[country] = df[country].fillna(df[country].mean())

            X = df.loc[:, df.columns != "Users " + target_country].values
            y = df["Users " + target_country].values

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

            Linear = LinearRegression()
            LassoLearn = Lasso()
            RidgeLearn = Ridge()
            RandomForest = RandomForestRegressor()
            learners = [Linear, LassoLearn, RidgeLearn, RandomForest]
            for reg in learners:
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_test)

                mae=metrics.mean_absolute_error(y_test, y_pred)
                r2=r2_score(y_test, y_pred)

                regressions = regressions.append({"Country": target_country, "Regression": str(reg),
                                                 "MAE": mae, "R2": r2}, ignore_index=True)
        else:
            i+=1
    return regressions

In [None]:
heroinReg = get_df_reg(heroinSuppliers, heroinUsers)
cocaineReg = get_df_reg(cocaineSuppliers, cocaineUsers)
cannabisReg = get_df_reg(cannabisSuppliers, cannabisUsers)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [None]:
heroinReg

In [None]:
cocaineReg

In [None]:
cannabisReg

## simple regression on Austria based on years and suppliers in Austria
## this is cringe, dont mind this (i may return to this)

In [None]:
target_country = 'Austria'
df_simple = heroinSuppliers[heroinSuppliers['Country'] == target_country].T[1:]
df_simple.columns = ["Suppliers " + target_country]

joiner = heroinUsers[heroinUsers['Country'] == target_country].T[1:]
joiner.columns = ["Users " + target_country]
df_simple = df_simple.join(joiner)
df_simple = df_simple.drop(2004)
df_simple['Years'] = [i for i in range(2005, 2018)][::-1]

In [None]:
reg_simple = linear_model.LinearRegression()
reg_simple.fit(df_simple[['Years','Suppliers Austria']], df_simple["Users Austria"])

In [None]:
user_list=[]

suppliers = df_simple['Suppliers Austria'].mean()
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print("In the year " + str(year) + " the model has predicted " + 
          str(users) + " users in Austria on " + str(suppliers) + " suppliers")
    
    user_list.append(users)    #on mean suppliers
print("====================")
suppliers = 700
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print("In the year " + str(year) + " the model has predicted " + 
          str(users) + " users in Austria on " + str(suppliers) + " suppliers")