In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
%matplotlib inline

In [2]:
# Every workbook is read with the same options: only the first 15 columns
# and the first 30 data rows are kept, for a clean dataframe.
read_options = dict(usecols=list(range(15)), nrows=30)

heroinUsers = pd.read_excel('./tables/drugOffenses/heroinUsers.xlsx', **read_options)
heroinSuppliers = pd.read_excel('./tables/drugOffenses/heroinSuppliers.xlsx', **read_options)

cannabisUsers = pd.read_excel('./tables/drugOffenses/cannabisUsers.xlsx', **read_options)
cannabisSuppliers = pd.read_excel('./tables/drugOffenses/cannabisSuppliers.xlsx', **read_options)

cocaineUsers = pd.read_excel('./tables/drugOffenses/cocaineUsers.xlsx', **read_options)
cocaineSuppliers = pd.read_excel('./tables/drugOffenses/cocaineSuppliers.xlsx', **read_options)

In [3]:
def _is_mostly_missing(column):
    """Return True when at least half of the entries in ``column`` are NaN."""
    # The length is halved, not the data: ``len(column / 2)`` would equal
    # ``len(column)`` and only ever flag columns that are entirely NaN.
    return column.isnull().sum() >= len(column) / 2


def _country_column(table, country, label):
    """Return ``country``'s row of ``table`` as a one-column frame named ``label``.

    The matching row is transposed so that every column of ``table`` becomes
    a row, then the first transposed row is dropped (presumably the 'Country'
    cell itself -- this assumes 'Country' is the first column of ``table``).
    """
    column = table[table['Country'] == country].T[1:]
    column.columns = [label]
    return column


def get_mae_r2(suppliers, users):
    """Fit one linear regression per country and score it on a held-out split.

    For every country listed in ``users['Country']`` the user series of that
    country is regressed on the supplier series of all countries found in
    ``suppliers`` (the target country's own suppliers come first).

    Missing data handling:
      * a country whose user series is at least half NaN is skipped;
      * a column that is at least half NaN after joining is dropped;
      * remaining NaN values are replaced by the column mean.

    A country is also skipped when, after cleaning, either its user column or
    every supplier column has been dropped, since no model can be fitted.

    Parameters
    ----------
    suppliers : pandas.DataFrame
        Table with a 'Country' column followed by one column per observation.
    users : pandas.DataFrame
        Table with the same layout as ``suppliers``.

    Returns
    -------
    dict
        Maps each modelled country to a ``(mae, r2)`` tuple computed on the
        30% test split (``random_state=0``).
    """
    data = dict()
    supplier_countries = list(suppliers['Country'])

    for target_country in users['Country']:
        users_label = "Users " + target_country

        # users in target country; skip early when the series is too sparse
        target = _country_column(users, target_country, users_label)
        if _is_mostly_missing(target[users_label]):
            continue
        target = target.fillna(target.mean())

        # making the dataframe for target_country: own suppliers first,
        # then the suppliers of every other country, then the users
        df = _country_column(suppliers, target_country, "Suppliers " + target_country)
        for country in supplier_countries:
            if country != target_country:
                df = df.join(_country_column(suppliers, country, "Suppliers " + country))
        df = df.join(target)

        # handling NaN values: drop sparse columns, mean-fill the rest
        for column in df.columns:
            if _is_mostly_missing(df[column]):
                df = df.drop(column, axis=1)
            else:
                df[column] = df[column].fillna(df[column].mean())

        # nothing to fit if the target or all features were dropped above
        if users_label not in df.columns or len(df.columns) < 2:
            continue

        X = df.loc[:, df.columns != users_label].values
        y = df[users_label].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        regressor = LinearRegression()
        regressor.fit(X_train, y_train)

        y_pred = regressor.predict(X_test)

        mae = metrics.mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        data[target_country] = (mae, r2)
    return data

In [None]:
# Run the per-country regression once per drug. Each result is a dict mapping
# a country name to its (mean absolute error, R² score) pair.
heroin = get_mae_r2(heroinSuppliers, heroinUsers)
cocaine = get_mae_r2(cocaineSuppliers, cocaineUsers)
cannabis = get_mae_r2(cannabisSuppliers, cannabisUsers)

### Multiple linear regression of heroin, cannabis, and cocaine users on the suppliers of each drug (reporting the mean absolute error and the R² score, i.e. the proportion of explained variance)

In [None]:
# Per-country (MAE, R²) results returned by get_mae_r2 for heroin.
heroin

In [None]:
# Per-country (MAE, R²) results returned by get_mae_r2 for cocaine.
cocaine

In [None]:
# Per-country (MAE, R²) results returned by get_mae_r2 for cannabis.
cannabis

## Simple regression for Austria based on the year and the number of suppliers in Austria
## Note: this section is a rough experiment, so don't rely on it (I may return to it later)

In [None]:
target_country = 'Austria'

# Supplier series of the target country: transpose its row and drop the
# first transposed row so only the per-year values remain.
df_simple = heroinSuppliers[heroinSuppliers['Country'] == target_country].T[1:]
df_simple = df_simple.set_axis(["Suppliers " + target_country], axis=1)

# User series of the same country, shaped the same way.
joiner = heroinUsers[heroinUsers['Country'] == target_country].T[1:]
joiner = joiner.set_axis(["Users " + target_country], axis=1)

# Align both series on the shared index and discard the 2004 row.
df_simple = df_simple.join(joiner).drop(2004)

# Year labels 2017 down to 2005.
# NOTE(review): assumes the remaining 13 rows are ordered newest-first -- confirm
# against the column order of the workbook.
df_simple['Years'] = list(range(2017, 2004, -1))

In [None]:
# Two-feature linear model: users explained by the year and the supplier count.
reg_simple = LinearRegression()
reg_simple.fit(
    X=df_simple[['Years', 'Suppliers Austria']],
    y=df_simple["Users Austria"],
)

In [None]:
user_list = []

# Scenario 1: hold the supplier count at its mean over the fitted rows.
suppliers = df_simple['Suppliers Austria'].mean()
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print(f"In the year {year!s} the model has predicted {users!s} "
          f"users in Austria on {suppliers!s} suppliers")

    # only the mean-supplier predictions are collected
    user_list.append(users)

print("====================")

# Scenario 2: a fixed count of 700 suppliers.
suppliers = 700
for year in range(2018, 2025):
    users = reg_simple.predict([[year, suppliers]])[0]
    print(f"In the year {year!s} the model has predicted {users!s} "
          f"users in Austria on {suppliers!s} suppliers")