In [1]:
# Start the Spark session used by the rest of the notebook.
#
# findspark is pointed at a Spark installation before pyspark is imported. The installation
# directory is read from the SPARK_HOME environment variable when it is set, and otherwise
# falls back to the path this notebook was originally written against, so the notebook is
# not tied to one machine's directory layout.
# NOTE(review): the IndexError recorded under this cell is presumably findspark failing to
# find Spark under the hard-coded path - confirm on the target machine.

import os

import findspark
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'))
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

IndexError: list index out of range

In [None]:
# Load the survey data. The first row of the CSV supplies the column names and Spark is asked
# to infer each column's type from its values.
drugdata = spark.read.options(header=True, inferSchema=True).csv('drugdata_names_dirtied.csv')

# Print the first 20 rows as a quick look at the data.
drugdata.show(20)

In [None]:
# Print the schema Spark inferred for the dataset (column names and their types).

drugdata.printSchema()

In [None]:
# Summary statistics for the dataset: counts, averages, minimums, maximums and standard deviations.
# Most columns are strings, so this is less useful than it would be for numeric data,
# but there is still some useful information (the counts in particular).

drugdata.describe().show()

In [None]:
# Restrict the summary to the numeric personality scores so it is easier to read.
# The scores are shown as two tables because a single table is too wide to read comfortably.
for summary_columns in (
    ['Neuroticism', 'Extraversion', 'Openness to experience', 'Conscientiousness'],
    ['Agreeableness', 'Impulsiveness', 'Sensation seeking'],
):
    drugdata.select(*summary_columns).describe().show()

In [None]:
# Explore the data in greater depth. A quick look at the table suggested that some attributes
# have one value that is heavily overrepresented; counting the rows per value shows whether that is so.
for attribute in ('Age', 'Gender', 'Education', 'Country', 'Ethnicity'):
    drugdata.groupBy(attribute).count().orderBy(attribute).show()

In [None]:
# Several columns have a count lower than the total number of rows, which indicates missing values.
# Comparing the row count of the full dataset with the row count after dropping every row that
# contains a missing value shows how many rows are affected.

print("Total data points:", drugdata.count())
dropped_data = drugdata.na.drop()
# Labelled differently from the line above so the two counts can be told apart in the output.
print("Data points without missing values:", dropped_data.count())

In [None]:
# There are 9 rows containing missing values. How they are handled depends on the data type,
# so the first step is to check whether any of the numeric columns is missing a value:
# a count below the dataset total means that column has gaps.
for numeric_column in ('Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking'):
    missing_field_data = drugdata.na.drop(subset=numeric_column)
    print("Total data points ({}):".format(numeric_column), missing_field_data.count())

In [None]:
# Agreeableness is the only numeric column with a missing value, so it is the only one handled here:
# the gap is filled with the mean of the column.

from pyspark.sql.functions import mean

# collect() returns a list holding a single Row whose only field is the mean.
(mean_row,) = drugdata.select(mean('Agreeableness')).collect()
mean_agreeableness = mean_row[0]

# Show the replacement value so it can be checked by eye.
print(mean_agreeableness)

# Fill the gap with the mean.
drugdata = drugdata.fillna(mean_agreeableness, subset=['Agreeableness'])

# Dropping rows with a missing Agreeableness should no longer remove anything.
missing_field_data = drugdata.dropna(subset="Agreeableness")
print("Total data points (Agreeableness):", missing_field_data.count())

In [None]:
# Drug use is the focus of this data mining task, so a record that is missing any drug use value
# is seriously compromised and is deleted. A row is dropped when at least one of the listed
# columns is missing.
drug_columns = ['Alcohol', 'Amphetamine', 'Amyl nitrate', 'Benzos', 'Caffeine', 'Cannabis',
                'Chocolate', 'Cocaine', 'Crack', 'Ecstacy', 'Heroin', 'Ketamine', 'Legal highs',
                'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'Semeron', 'Volatile']
drugdata = drugdata.na.drop(subset=drug_columns)
print("Total data points:", drugdata.count())

In [None]:
# Missing values remain, but they are not worth deleting entire records over.
# Any that are left are in one of the first few (demographic) columns, so those are
# filled with a placeholder instead.
demographic_columns = ['Age', 'Gender', 'Education', 'Country', 'Ethnicity']
drugdata = drugdata.fillna('Unknown', subset=demographic_columns)

# Count the rows that survive dropping incomplete records, to confirm nothing is missing any more.
print("Total data points:", drugdata.dropna().count())

In [None]:
# With the missing values dealt with, data quality can still be improved.
# Semeron is a fictitious drug, so a respondent who claims to have used it is unreliable.
# Only records whose Semeron value is 'CL0' are kept.
drugdata = drugdata.where(drugdata['Semeron'] == 'CL0')

print("Total data points:", drugdata.count())

In [None]:
# Not every column is needed: Semeron is now redundant, and caffeine and chocolate
# are not very useful either. Keep an explicit list of the remaining columns.
kept_columns = [
    'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Neuroticism', 'Extraversion', 'Openness to experience',
    'Agreeableness', 'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    'Alcohol', 'Amphetamine', 'Amyl nitrate', 'Benzos', 'Cannabis', 'Cocaine', 'Crack',
    'Ecstacy', 'Heroin', 'Ketamine', 'Legal highs', 'LSD', 'Meth', 'Mushrooms',
    'Nicotine', 'Volatile',
]
drugdata = drugdata.select(kept_columns)
drugdata.show()

In [None]:
# Look at the distribution of personality traits by drug use status for some of the target drugs.

from pyspark.sql.functions import format_number, col


# Personality data by heroin use status: the mean of each trait per usage class, rounded to two
# decimals. Each formatted column is aliased directly to its display heading.
heroin_stats = drugdata.groupBy('Heroin').mean().select(
    'Heroin',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
heroin_stats.orderBy('Heroin').show()

In [None]:
# Some traits seem to correlate with heroin use, so visualize them as box plots.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
heroin_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Agreeableness', 'Sensation seeking'):
    heroin_plot_data.boxplot(column=trait, by='Heroin')

In [None]:
# Personality data by cocaine use status: the mean of each trait per usage class, rounded to
# two decimals. Each formatted column is aliased directly to its display heading.
cocaine_stats = drugdata.groupBy('Cocaine').mean().select(
    'Cocaine',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
cocaine_stats.orderBy('Cocaine').show()

In [None]:
# Again, some traits seem to correlate with cocaine use status, so visualize them as box plots.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
cocaine_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Agreeableness', 'Impulsiveness', 'Sensation seeking'):
    cocaine_plot_data.boxplot(column=trait, by='Cocaine')

In [None]:
# Personality data by cannabis use status: the mean of each trait per usage class, rounded to
# two decimals. Each formatted column is aliased directly to its display heading.
cannabis_stats = drugdata.groupBy('Cannabis').mean().select(
    'Cannabis',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
cannabis_stats.orderBy('Cannabis').show()

In [None]:
# Visualize selected traits by cannabis use status as box plots.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
cannabis_plot_data = drugdata.toPandas()
for trait in ('Openness to experience', 'Conscientiousness', 'Impulsiveness', 'Sensation seeking'):
    cannabis_plot_data.boxplot(column=trait, by='Cannabis')

In [None]:
# Personality data by meth use status: the mean of each trait per usage class, rounded to
# two decimals. Each formatted column is aliased directly to its display heading.
meth_stats = drugdata.groupBy('Meth').mean().select(
    'Meth',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
meth_stats.orderBy('Meth').show()

In [None]:
# Box plots of selected traits by meth use status.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
meth_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Extraversion', 'Conscientiousness', 'Impulsiveness'):
    meth_plot_data.boxplot(column=trait, by='Meth')

In [None]:
# Personality data by nicotine use status: the mean of each trait per usage class, rounded to
# two decimals. Each formatted column is aliased directly to its display heading.
nicotine_stats = drugdata.groupBy('Nicotine').mean().select(
    'Nicotine',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
nicotine_stats.orderBy('Nicotine').show()

In [None]:
# Box plots of selected traits by nicotine use status.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
nicotine_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Conscientiousness', 'Impulsiveness', 'Sensation seeking'):
    nicotine_plot_data.boxplot(column=trait, by='Nicotine')

In [None]:
# Personality data by benzodiazepine use status: the mean of each trait per usage class, rounded
# to two decimals. Each formatted column is aliased directly to its display heading.
benzo_stats = drugdata.groupBy('Benzos').mean().select(
    'Benzos',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
benzo_stats.orderBy('Benzos').show()

In [None]:
# Box plots of selected traits by benzodiazepine use status.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
benzo_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Extraversion', 'Conscientiousness', 'Impulsiveness'):
    benzo_plot_data.boxplot(column=trait, by='Benzos')

In [None]:
# Personality data by amphetamine use status: the mean of each trait per usage class, rounded
# to two decimals. Each formatted column is aliased directly to its display heading.
amphetamine_stats = drugdata.groupBy('Amphetamine').mean().select(
    'Amphetamine',
    *[format_number('avg({})'.format(trait), 2).alias(heading)
      for trait, heading in [('Neuroticism', 'Neuroticism'),
                             ('Extraversion', 'Extraversion'),
                             ('Openness to experience', 'Openness...'),
                             ('Agreeableness', 'Agreeableness'),
                             ('Conscientiousness', 'Conscientiousness'),
                             ('Impulsiveness', 'Impulsiveness'),
                             ('Sensation seeking', 'Sensation seeking')]])
amphetamine_stats.orderBy('Amphetamine').show()

In [None]:
# Box plots of selected traits by amphetamine use status.
# The Spark DataFrame is converted to pandas once and reused for every plot, instead of
# being converted again for each plot.
amphetamine_plot_data = drugdata.toPandas()
for trait in ('Neuroticism', 'Agreeableness', 'Conscientiousness', 'Impulsiveness',
              'Sensation seeking'):
    amphetamine_plot_data.boxplot(column=trait, by='Amphetamine')

In [None]:
# That is some interesting preliminary information. Now to do some machine learning.

# Import relevant libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.classification import LogisticRegression

In [None]:
# Reduce the dimensionality of the table. Some truly irrelevant columns are already gone, but
# the distributions of Ethnicity and Country also give good reason to doubt their predictive
# power, and a few more drugs that seem less relevant are removed as well.
model_columns = [
    'Age', 'Gender', 'Education',
    'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
    'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    'Alcohol', 'Amphetamine', 'Benzos', 'Cannabis', 'Cocaine', 'Crack', 'Ecstacy',
    'Heroin', 'Meth', 'Nicotine',
]
drugdata = drugdata.select(model_columns)

drugdata.printSchema()

In [None]:
# Cocaine is the first target variable; plot how many records fall into each usage class
# to get an idea of the class distribution.
cocaine_class_sizes = drugdata.toPandas().groupby('Cocaine').size()
cocaine_class_sizes.plot(kind='bar')

In [None]:
# Cocaine is the target variable. Binomial regression needs a binary target, so the seven usage
# classes are collapsed into two: CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
usage_classes = ['CL0', 'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6']
binary_classes = ['Non-user'] * 3 + ['User'] * 4
cocaine_dataset = drugdata.replace(usage_classes, binary_classes, 'Cocaine')

# Check that the replacement worked.
cocaine_dataset.select('Cocaine').show()

In [None]:
# Much of the information is in string form, so it has to be converted before it can be modelled.
# Code adapted from tutorial examples.
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)


def _indexer(column, index_column):
    """Return a StringIndexer that reads `column` and writes its index to `index_column`."""
    return StringIndexer(inputCol=column, outputCol=index_column)


def _encoder(prefix):
    """Return a OneHotEncoder that reads `<prefix>Index` and writes `<prefix>Vec`."""
    return OneHotEncoder(inputCol=prefix + 'Index', outputCol=prefix + 'Vec')


# Cocaine is the target for this iteration, so its indexer writes to 'label' and it has no encoder.
age_indexer = _indexer('Age', 'ageIndex')
gender_indexer = _indexer('Gender', 'genderIndex')
education_indexer = _indexer('Education', 'educationIndex')
alcohol_indexer = _indexer('Alcohol', 'alcoholIndex')
amphetamine_indexer = _indexer('Amphetamine', 'amphetamineIndex')
benzos_indexer = _indexer('Benzos', 'benzosIndex')
cannabis_indexer = _indexer('Cannabis', 'cannabisIndex')
cocaine_indexer = _indexer('Cocaine', 'label')
crack_indexer = _indexer('Crack', 'crackIndex')
ecstacy_indexer = _indexer('Ecstacy', 'ecstacyIndex')
heroin_indexer = _indexer('Heroin', 'heroinIndex')
meth_indexer = _indexer('Meth', 'methIndex')
nicotine_indexer = _indexer('Nicotine', 'nicotineIndex')

age_encoder = _encoder('age')
gender_encoder = _encoder('gender')
education_encoder = _encoder('education')
alcohol_encoder = _encoder('alcohol')
amphetamine_encoder = _encoder('amphetamine')
benzos_encoder = _encoder('benzos')
cannabis_encoder = _encoder('cannabis')
crack_encoder = _encoder('crack')
ecstacy_encoder = _encoder('ecstacy')
heroin_encoder = _encoder('heroin')
meth_encoder = _encoder('meth')
nicotine_encoder = _encoder('nicotine')

# The feature vector is the one-hot encoded categorical columns followed by the numeric scores.
encoded_feature_columns = [prefix + 'Vec' for prefix in (
    'age', 'gender', 'education', 'alcohol', 'amphetamine', 'benzos',
    'cannabis', 'crack', 'ecstacy', 'heroin', 'meth', 'nicotine')]
numeric_feature_columns = ['Neuroticism', 'Extraversion', 'Openness to experience',
                           'Agreeableness', 'Conscientiousness', 'Impulsiveness',
                           'Sensation seeking']
assembler = VectorAssembler(inputCols=encoded_feature_columns + numeric_feature_columns,
                            outputCol="features")

In [None]:
# With every stage defined, chain them in a pipeline: first all indexers, then all encoders,
# and finally the assembler that builds the feature vector.
# Code adapted from tutorial examples.

from pyspark.ml import Pipeline

indexer_stages = [age_indexer, gender_indexer, education_indexer, alcohol_indexer,
                  amphetamine_indexer, benzos_indexer, cannabis_indexer, cocaine_indexer,
                  crack_indexer, ecstacy_indexer, heroin_indexer, meth_indexer, nicotine_indexer]
encoder_stages = [age_encoder, gender_encoder, education_encoder, alcohol_encoder,
                  amphetamine_encoder, benzos_encoder, cannabis_encoder, crack_encoder,
                  ecstacy_encoder, heroin_encoder, meth_encoder, nicotine_encoder]
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(cocaine_dataset)

# Apply the fitted pipeline and keep only the two columns the model needs.
pipe_df = pipeline_model.transform(cocaine_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model on the training part, then classify
# the held-out test part.

# A fixed seed makes the split, and therefore every metric computed from it, repeatable
# from one run of the notebook to the next.
train_data, test_data = pipe_df.randomSplit([0.7,0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# The unfitted estimator gets its own name; `lr_model` always refers to the fitted model.
lr_estimator = LogisticRegression(featuresCol='features',labelCol='label')
lr_model = lr_estimator.fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Confusion matrix on the test set. The (label, prediction) pairs are counted in one pass;
# a pair that never occurs is simply absent from the result and counts as 0.
confusion = dict(((row['label'], row['prediction']), row['count'])
                 for row in results.groupBy('label', 'prediction').count().collect())
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC. lr_model.summary is the summary of the fit on the training data, so this
# curve and the AUC printed with it are training-set figures and are labelled as such.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'],ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve (training data)')
plt.show()

# Print the AUC statistic for the training data.
print('Area Under the Curve (training data): ' + str(training_summary.areaUnderROC))

# Also report the AUC on the held-out test predictions, which is the figure that matches
# the confusion matrix above.
test_auc = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label',
                                         metricName='areaUnderROC').evaluate(results)
print('Area Under the Curve (test data): ' + str(test_auc))

In [None]:
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Show precision & recall on the test set. Either ratio is undefined when its denominator is
# zero (no positive labels, or no positive predictions); NaN is reported in that case instead
# of raising ZeroDivisionError.

r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

# The precision-recall curve comes from the training summary, so it describes the training data.
pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'],pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve (training data)')
plt.show()

In [None]:
# Repeat the modelling for other target variables, starting with cannabis:
# plot how many records fall into each usage class.
cannabis_class_sizes = drugdata.toPandas().groupby('Cannabis').size()
cannabis_class_sizes.plot(kind='bar')

In [None]:
# Prepare the dataset again, this time with cannabis as the target variable.
from pyspark.sql.functions import when

cannabis_dataset = drugdata.replace(['CL0','CL1','CL2'],'Non-user','Cannabis')
cannabis_dataset = cannabis_dataset.replace(['CL3','CL4','CL5','CL6'],'User','Cannabis')

cannabis_dataset.select('Cannabis').show()

# Encode the target explicitly: 'User' is 1.0 and everything else is 0.0.
# A StringIndexer numbers values by how often they occur (the most frequent value gets 0.0),
# so with it the class called "positive" by the evaluation cells would depend on class balance
# rather than always being 'User'.
cannabis_dataset = cannabis_dataset.withColumn(
    'label', when(cannabis_dataset['Cannabis'] == 'User', 1.0).otherwise(0.0))

# Index the categorical predictors. Cannabis is the target here, so it is not indexed as a feature.
age_indexer = StringIndexer(inputCol='Age',outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender',outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education',outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol',outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine',outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos',outputCol='benzosIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine',outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack',outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy',outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin',outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth',outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine',outputCol='nicotineIndex')


# One-hot encode every indexed predictor.
age_encoder = OneHotEncoder(inputCol='ageIndex',outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex',outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex',outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex',outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex',outputCol='amphetamineVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex',outputCol='benzosVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex',outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex',outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex',outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex',outputCol='heroinVec')
meth_encoder = OneHotEncoder(inputCol='methIndex',outputCol='methVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex',outputCol='nicotineVec')

# Combine the encoded predictors and the numeric personality scores into one feature vector.
assembler = VectorAssembler(inputCols=['ageVec','genderVec','educationVec','alcoholVec','amphetamineVec','benzosVec',
                                      'cocaineVec','crackVec','ecstacyVec','heroinVec','methVec','nicotineVec','Neuroticism',
                                      'Extraversion','Openness to experience','Agreeableness','Conscientiousness',
                                      'Impulsiveness','Sensation seeking'], outputCol="features")

# The label column already exists on the dataset, so the pipeline only builds the features.
pipeline = Pipeline(stages=[age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
                            benzos_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
                            heroin_indexer, meth_indexer, nicotine_indexer, age_encoder, gender_encoder, education_encoder,
                            alcohol_encoder, amphetamine_encoder, benzos_encoder, cocaine_encoder, crack_encoder,
                            ecstacy_encoder, heroin_encoder, meth_encoder, nicotine_encoder, assembler])

pipeline_model = pipeline.fit(cannabis_dataset)

# Incorporate results into a new DataFrame.
pipe_df = pipeline_model.transform(cannabis_dataset)

# Remove all variables other than features and label.
pipe_df = pipe_df.select('label', 'features')

In [None]:
# Split the data 70/30, train the model on the training part and classify the test part.

# A fixed seed makes the split, and therefore every metric computed from it, repeatable
# from one run of the notebook to the next.
train_data, test_data = pipe_df.randomSplit([0.7,0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# The unfitted estimator gets its own name; `lr_model` always refers to the fitted model.
lr_estimator = LogisticRegression(featuresCol='features',labelCol='label')
lr_model = lr_estimator.fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Confusion matrix on the test set. The (label, prediction) pairs are counted in one pass;
# a pair that never occurs is simply absent from the result and counts as 0.
confusion = dict(((row['label'], row['prediction']), row['count'])
                 for row in results.groupBy('label', 'prediction').count().collect())
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC. lr_model.summary is the summary of the fit on the training data, so this
# curve and the AUC printed with it are training-set figures and are labelled as such.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'],ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve (training data)')
plt.show()

# Print the AUC statistic for the training data.
print('Area Under the Curve (training data): ' + str(training_summary.areaUnderROC))

# Also report the AUC on the held-out test predictions, which is the figure that matches
# the confusion matrix above.
test_auc = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label',
                                         metricName='areaUnderROC').evaluate(results)
print('Area Under the Curve (test data): ' + str(test_auc))

# Show precision & recall on the test set. Either ratio is undefined when its denominator is
# zero (no positive labels, or no positive predictions); NaN is reported in that case instead
# of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

# The precision-recall curve comes from the training summary, so it describes the training data.
pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'],pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve (training data)')
plt.show()

In [None]:
# Repeat for other target variables.

# Bar chart of the number of rows in each Nicotine category.
(
    drugdata.toPandas()
    .groupby('Nicotine')
    .size()
    .plot(kind='bar')
)

In [None]:
# Prepare the dataset once more, this time with nicotine as the target variable.
# In the Nicotine column, CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
nicotine_dataset = (
    drugdata
    .replace(['CL0', 'CL1', 'CL2'], 'Non-user', 'Nicotine')
    .replace(['CL3', 'CL4', 'CL5', 'CL6'], 'User', 'Nicotine')
)

nicotine_dataset.select('Nicotine').show()

# Index every string column. The Nicotine index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='benzosIndex')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='label')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex', outputCol='amphetamineVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex', outputCol='benzosVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex', outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex', outputCol='heroinVec')
meth_encoder = OneHotEncoder(inputCol='methIndex', outputCol='methVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'amphetamineVec', 'benzosVec',
        'cannabisVec', 'cocaineVec', 'crackVec', 'ecstacyVec', 'heroinVec', 'methVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
    benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
    heroin_encoder, meth_encoder,
    assembler,
])

pipeline_model = pipeline.fit(nicotine_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(nicotine_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model and classify the test set.

# A fixed seed makes the split -- and every metric derived from it -- repeatable across
# runs on the same input data; without it each run evaluates a different test set.
train_data, test_data = pipe_df.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Fit on the training rows only, then score the held-out rows.
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Generate a confusion matrix from the test-set predictions, treating class 1.0 as "positive".
# NOTE: if 'label' was produced by a StringIndexer, 1.0 is the less frequent class in the data,
# which may be either 'User' or 'Non-user' -- check before interpreting these numbers.
tp = results.filter((results.label == 1.0) & (results.prediction == 1.0)).count()
tn = results.filter((results.label == 0.0) & (results.prediction == 0.0)).count()
fp = results.filter((results.label == 0.0) & (results.prediction == 1.0)).count()
fn = results.filter((results.label == 1.0) & (results.prediction == 0.0)).count()
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC.
# lr_model.summary describes the fit on the *training* data, so the ROC curve, AUC and
# precision-recall curve below are training metrics, unlike the test-set counts above.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'], ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

# Print the AUC statistic.
print('Area Under the Curve: ' + str(training_summary.areaUnderROC))

# Show precision & recall on the test set.
# Either denominator can be zero (no class-1.0 rows in the test split, or the model never
# predicts class 1.0); report NaN in that case instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) > 0 else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) > 0 else float('nan')
print("Precision:", p)

pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'], pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Repeat for other target variables, in this case heroin.

# Bar chart of the number of rows in each Heroin category.
(
    drugdata.toPandas()
    .groupby('Heroin')
    .size()
    .plot(kind='bar')
)

In [None]:
# Prepare the dataset with heroin as the target variable.
# In the Heroin column, CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
heroin_dataset = (
    drugdata
    .replace(['CL0', 'CL1', 'CL2'], 'Non-user', 'Heroin')
    .replace(['CL3', 'CL4', 'CL5', 'CL6'], 'User', 'Heroin')
)

heroin_dataset.select('Heroin').show()

# Index every string column. The Heroin index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='benzosIndex')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='label')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='nicotineIndex')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex', outputCol='amphetamineVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex', outputCol='benzosVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex', outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
meth_encoder = OneHotEncoder(inputCol='methIndex', outputCol='methVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex', outputCol='nicotineVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'amphetamineVec', 'benzosVec',
        'cannabisVec', 'cocaineVec', 'crackVec', 'ecstacyVec', 'methVec', 'nicotineVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
    benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
    meth_encoder, nicotine_encoder,
    assembler,
])

pipeline_model = pipeline.fit(heroin_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(heroin_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model and classify the test set.

# A fixed seed makes the split -- and every metric derived from it -- repeatable across
# runs on the same input data; without it each run evaluates a different test set.
train_data, test_data = pipe_df.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Fit on the training rows only, then score the held-out rows.
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Generate a confusion matrix from the test-set predictions, treating class 1.0 as "positive".
# NOTE: if 'label' was produced by a StringIndexer, 1.0 is the less frequent class in the data,
# which may be either 'User' or 'Non-user' -- check before interpreting these numbers.
tp = results.filter((results.label == 1.0) & (results.prediction == 1.0)).count()
tn = results.filter((results.label == 0.0) & (results.prediction == 0.0)).count()
fp = results.filter((results.label == 0.0) & (results.prediction == 1.0)).count()
fn = results.filter((results.label == 1.0) & (results.prediction == 0.0)).count()
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC.
# lr_model.summary describes the fit on the *training* data, so the ROC curve, AUC and
# precision-recall curve below are training metrics, unlike the test-set counts above.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'], ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

# Print the AUC statistic.
print('Area Under the Curve: ' + str(training_summary.areaUnderROC))

# Show precision & recall on the test set.
# Either denominator can be zero (no class-1.0 rows in the test split, or the model never
# predicts class 1.0); report NaN in that case instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) > 0 else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) > 0 else float('nan')
print("Precision:", p)

pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'], pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Repeat for other target variables, in this case amphetamine.

# Bar chart of the number of rows in each Amphetamine category.
(
    drugdata.toPandas()
    .groupby('Amphetamine')
    .size()
    .plot(kind='bar')
)

In [None]:
# Prepare the dataset with amphetamine as the target variable.
# In the Amphetamine column, CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
amphetamine_dataset = (
    drugdata
    .replace(['CL0', 'CL1', 'CL2'], 'Non-user', 'Amphetamine')
    .replace(['CL3', 'CL4', 'CL5', 'CL6'], 'User', 'Amphetamine')
)

amphetamine_dataset.select('Amphetamine').show()

# Index every string column. The Amphetamine index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='label')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='benzosIndex')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='nicotineIndex')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex', outputCol='benzosVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex', outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex', outputCol='heroinVec')
meth_encoder = OneHotEncoder(inputCol='methIndex', outputCol='methVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex', outputCol='nicotineVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'benzosVec',
        'cannabisVec', 'cocaineVec', 'crackVec', 'ecstacyVec', 'heroinVec', 'methVec', 'nicotineVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder,
    benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
    heroin_encoder, meth_encoder, nicotine_encoder,
    assembler,
])

pipeline_model = pipeline.fit(amphetamine_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(amphetamine_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model and classify the test set.

# A fixed seed makes the split -- and every metric derived from it -- repeatable across
# runs on the same input data; without it each run evaluates a different test set.
train_data, test_data = pipe_df.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Fit on the training rows only, then score the held-out rows.
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Generate a confusion matrix from the test-set predictions, treating class 1.0 as "positive".
# NOTE: if 'label' was produced by a StringIndexer, 1.0 is the less frequent class in the data,
# which may be either 'User' or 'Non-user' -- check before interpreting these numbers.
tp = results.filter((results.label == 1.0) & (results.prediction == 1.0)).count()
tn = results.filter((results.label == 0.0) & (results.prediction == 0.0)).count()
fp = results.filter((results.label == 0.0) & (results.prediction == 1.0)).count()
fn = results.filter((results.label == 1.0) & (results.prediction == 0.0)).count()
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC.
# lr_model.summary describes the fit on the *training* data, so the ROC curve, AUC and
# precision-recall curve below are training metrics, unlike the test-set counts above.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'], ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

# Print the AUC statistic.
print('Area Under the Curve: ' + str(training_summary.areaUnderROC))

# Show precision & recall on the test set.
# Either denominator can be zero (no class-1.0 rows in the test split, or the model never
# predicts class 1.0); report NaN in that case instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) > 0 else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) > 0 else float('nan')
print("Precision:", p)

pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'], pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Repeat for other target variables, in this case benzodiazepines.

# Bar chart of the number of rows in each Benzos category.
(
    drugdata.toPandas()
    .groupby('Benzos')
    .size()
    .plot(kind='bar')
)

In [None]:
# Prepare the dataset with benzodiazepines as the target variable.
# In the Benzos column, CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
benzos_dataset = (
    drugdata
    .replace(['CL0', 'CL1', 'CL2'], 'Non-user', 'Benzos')
    .replace(['CL3', 'CL4', 'CL5', 'CL6'], 'User', 'Benzos')
)

benzos_dataset.select('Benzos').show()

# Index every string column. The Benzos index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='label')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='nicotineIndex')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex', outputCol='amphetamineVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex', outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex', outputCol='heroinVec')
meth_encoder = OneHotEncoder(inputCol='methIndex', outputCol='methVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex', outputCol='nicotineVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'amphetamineVec',
        'cannabisVec', 'cocaineVec', 'crackVec', 'ecstacyVec', 'heroinVec', 'methVec', 'nicotineVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
    cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
    heroin_encoder, meth_encoder, nicotine_encoder,
    assembler,
])

pipeline_model = pipeline.fit(benzos_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(benzos_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model and classify the test set.

# A fixed seed makes the split -- and every metric derived from it -- repeatable across
# runs on the same input data; without it each run evaluates a different test set.
train_data, test_data = pipe_df.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Fit on the training rows only, then score the held-out rows.
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Generate a confusion matrix from the test-set predictions, treating class 1.0 as "positive".
# NOTE: if 'label' was produced by a StringIndexer, 1.0 is the less frequent class in the data,
# which may be either 'User' or 'Non-user' -- check before interpreting these numbers.
tp = results.filter((results.label == 1.0) & (results.prediction == 1.0)).count()
tn = results.filter((results.label == 0.0) & (results.prediction == 0.0)).count()
fp = results.filter((results.label == 0.0) & (results.prediction == 1.0)).count()
fn = results.filter((results.label == 1.0) & (results.prediction == 0.0)).count()
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC.
# lr_model.summary describes the fit on the *training* data, so the ROC curve, AUC and
# precision-recall curve below are training metrics, unlike the test-set counts above.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'], ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

# Print the AUC statistic.
print('Area Under the Curve: ' + str(training_summary.areaUnderROC))

# Show precision & recall on the test set.
# Either denominator can be zero (no class-1.0 rows in the test split, or the model never
# predicts class 1.0); report NaN in that case instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) > 0 else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) > 0 else float('nan')
print("Precision:", p)

pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'], pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Repeat for other target variables, in this case methamphetamine.

# Bar chart of the number of rows in each Meth category.
(
    drugdata.toPandas()
    .groupby('Meth')
    .size()
    .plot(kind='bar')
)

In [None]:
# Prepare the dataset with methamphetamine as the target variable.
# In the Meth column, CL0-CL2 become 'Non-user' and CL3-CL6 become 'User'.
meth_dataset = (
    drugdata
    .replace(['CL0', 'CL1', 'CL2'], 'Non-user', 'Meth')
    .replace(['CL3', 'CL4', 'CL5', 'CL6'], 'User', 'Meth')
)

meth_dataset.select('Meth').show()

# Index every string column. The Meth index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='benzosIndex')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='cocaineIndex')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='label')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='nicotineIndex')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex', outputCol='amphetamineVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex', outputCol='benzosVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
cocaine_encoder = OneHotEncoder(inputCol='cocaineIndex', outputCol='cocaineVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex', outputCol='heroinVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex', outputCol='nicotineVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'amphetamineVec', 'benzosVec',
        'cannabisVec', 'cocaineVec', 'crackVec', 'ecstacyVec', 'heroinVec', 'nicotineVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
    benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
    heroin_encoder, nicotine_encoder,
    assembler,
])

pipeline_model = pipeline.fit(meth_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(meth_dataset).select('label', 'features')

In [None]:
# Split the data 70/30, train a logistic regression model and classify the test set.

# A fixed seed makes the split -- and every metric derived from it -- repeatable across
# runs on the same input data; without it each run evaluates a different test set.
train_data, test_data = pipe_df.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Fit on the training rows only, then score the held-out rows.
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train_data)
results = lr_model.transform(test_data)

In [None]:
# Visualize and evaluate the results.
# Code adapted from https://gist.github.com/ispmarin/05feacd8be5e2901cf2b35453a148060 and tutorial examples.

# Generate a confusion matrix from the test-set predictions, treating class 1.0 as "positive".
# NOTE: if 'label' was produced by a StringIndexer, 1.0 is the less frequent class in the data,
# which may be either 'User' or 'Non-user' -- check before interpreting these numbers.
tp = results.filter((results.label == 1.0) & (results.prediction == 1.0)).count()
tn = results.filter((results.label == 0.0) & (results.prediction == 0.0)).count()
fp = results.filter((results.label == 0.0) & (results.prediction == 1.0)).count()
fn = results.filter((results.label == 1.0) & (results.prediction == 0.0)).count()
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total: " + str(results.count()))

# Plot the ROC.
# lr_model.summary describes the fit on the *training* data, so the ROC curve, AUC and
# precision-recall curve below are training metrics, unlike the test-set counts above.
training_summary = lr_model.summary

# Convert the DataFrame to a Pandas DataFrame.
ROC = training_summary.roc.toPandas()

# Plot the true positive and false positive rates.
plt.plot(ROC['FPR'], ROC['TPR'])

# Define the labels.
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()

# Print the AUC statistic.
print('Area Under the Curve: ' + str(training_summary.areaUnderROC))

# Show precision & recall on the test set.
# Either denominator can be zero (no class-1.0 rows in the test split, or the model never
# predicts class 1.0); report NaN in that case instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) > 0 else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) > 0 else float('nan')
print("Precision:", p)

pr = training_summary.pr.toPandas()

# Plot model recall and precision.
plt.plot(pr['recall'], pr['precision'])

# Define the labels and show the graph.
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# This iteration uses the Decision Tree algorithm, with cocaine as the target feature.

# Index every string column. The Cocaine index is written to 'label' because it is the target.
# StringIndexer assigns 0.0 to the most frequent value, so the meaning of label 1.0 depends on the data.
age_indexer = StringIndexer(inputCol='Age', outputCol='ageIndex')
gender_indexer = StringIndexer(inputCol='Gender', outputCol='genderIndex')
education_indexer = StringIndexer(inputCol='Education', outputCol='educationIndex')
alcohol_indexer = StringIndexer(inputCol='Alcohol', outputCol='alcoholIndex')
amphetamine_indexer = StringIndexer(inputCol='Amphetamine', outputCol='amphetamineIndex')
benzos_indexer = StringIndexer(inputCol='Benzos', outputCol='benzosIndex')
cannabis_indexer = StringIndexer(inputCol='Cannabis', outputCol='cannabisIndex')
cocaine_indexer = StringIndexer(inputCol='Cocaine', outputCol='label')
crack_indexer = StringIndexer(inputCol='Crack', outputCol='crackIndex')
ecstacy_indexer = StringIndexer(inputCol='Ecstacy', outputCol='ecstacyIndex')
heroin_indexer = StringIndexer(inputCol='Heroin', outputCol='heroinIndex')
meth_indexer = StringIndexer(inputCol='Meth', outputCol='methIndex')
nicotine_indexer = StringIndexer(inputCol='Nicotine', outputCol='nicotineIndex')

# One-hot encode each indexed predictor (the target gets no encoder).
age_encoder = OneHotEncoder(inputCol='ageIndex', outputCol='ageVec')
gender_encoder = OneHotEncoder(inputCol='genderIndex', outputCol='genderVec')
education_encoder = OneHotEncoder(inputCol='educationIndex', outputCol='educationVec')
alcohol_encoder = OneHotEncoder(inputCol='alcoholIndex', outputCol='alcoholVec')
amphetamine_encoder = OneHotEncoder(inputCol='amphetamineIndex', outputCol='amphetamineVec')
benzos_encoder = OneHotEncoder(inputCol='benzosIndex', outputCol='benzosVec')
cannabis_encoder = OneHotEncoder(inputCol='cannabisIndex', outputCol='cannabisVec')
crack_encoder = OneHotEncoder(inputCol='crackIndex', outputCol='crackVec')
ecstacy_encoder = OneHotEncoder(inputCol='ecstacyIndex', outputCol='ecstacyVec')
heroin_encoder = OneHotEncoder(inputCol='heroinIndex', outputCol='heroinVec')
meth_encoder = OneHotEncoder(inputCol='methIndex', outputCol='methVec')
nicotine_encoder = OneHotEncoder(inputCol='nicotineIndex', outputCol='nicotineVec')

# Assemble the encoded vectors and the unencoded columns into one 'features' vector.
assembler = VectorAssembler(
    inputCols=[
        'ageVec', 'genderVec', 'educationVec', 'alcoholVec', 'amphetamineVec', 'benzosVec',
        'cannabisVec', 'crackVec', 'ecstacyVec', 'heroinVec', 'methVec', 'nicotineVec',
        'Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
        'Conscientiousness', 'Impulsiveness', 'Sensation seeking',
    ],
    outputCol="features",
)

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=[
    age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
    benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
    heroin_indexer, meth_indexer, nicotine_indexer,
    age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
    benzos_encoder, cannabis_encoder, crack_encoder, ecstacy_encoder,
    heroin_encoder, meth_encoder, nicotine_encoder,
    assembler,
])

# cocaine_dataset is not created in this cell; it must already exist from an earlier cell.
pipeline_model = pipeline.fit(cocaine_dataset)

# Transform the dataset and keep only the label and the feature vector.
pipe_df = pipeline_model.transform(cocaine_dataset).select('label', 'features')

In [None]:
# Run decision tree and evaluate the model.
# Code adapted from examples in the Spark documentation.

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# 70/30 train/test split. A fixed seed makes the split -- and the error reported below --
# repeatable across runs on the same input data.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# Note: this rebinds `pipeline`, replacing the preprocessing pipeline with a
# single-stage pipeline that holds only the classifier.
pipeline = Pipeline(stages=[dt])

# Fit on the training rows only.
model = pipeline.fit(trainingData)

# Score the held-out rows.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

# Accuracy on the test set; the reported test error is 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

# The fitted decision tree is the only stage of the fitted pipeline.
treeModel = model.stages[0]
# summary only
print(treeModel)

In [None]:
# Confusion-matrix counts for the test predictions, treating label 1.0 as the
# positive class. A single groupBy yields every (label, prediction) count in one
# Spark job instead of one job per cell of the matrix.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with cannabis as the target variable.

# (source column, indexed column) for every categorical attribute. 'Cannabis' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'amphetamineIndex'),
    ('Benzos', 'benzosIndex'),
    ('Cannabis', 'label'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'heroinIndex'),
    ('Meth', 'methIndex'),
    ('Nicotine', 'nicotineIndex'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'amphetamine', 'benzos',
                      'cocaine', 'crack', 'ecstacy', 'heroin', 'meth', 'nicotine']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
 benzos_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder, heroin_encoder,
 meth_encoder, nicotine_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(cannabis_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(cannabis_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with nicotine as the target variable.

# (source column, indexed column) for every categorical attribute. 'Nicotine' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'amphetamineIndex'),
    ('Benzos', 'benzosIndex'),
    ('Cannabis', 'cannabisIndex'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'heroinIndex'),
    ('Meth', 'methIndex'),
    ('Nicotine', 'label'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'amphetamine', 'benzos',
                      'cannabis', 'cocaine', 'crack', 'ecstacy', 'heroin', 'meth']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
 benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
 heroin_encoder, meth_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(nicotine_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(nicotine_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with heroin as the target variable.

# (source column, indexed column) for every categorical attribute. 'Heroin' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'amphetamineIndex'),
    ('Benzos', 'benzosIndex'),
    ('Cannabis', 'cannabisIndex'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'label'),
    ('Meth', 'methIndex'),
    ('Nicotine', 'nicotineIndex'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'amphetamine', 'benzos',
                      'cannabis', 'cocaine', 'crack', 'ecstacy', 'meth', 'nicotine']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
 benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
 meth_encoder, nicotine_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(heroin_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(heroin_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with amphetamine as the target variable.

# (source column, indexed column) for every categorical attribute. 'Amphetamine' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'label'),
    ('Benzos', 'benzosIndex'),
    ('Cannabis', 'cannabisIndex'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'heroinIndex'),
    ('Meth', 'methIndex'),
    ('Nicotine', 'nicotineIndex'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'benzos', 'cannabis',
                      'cocaine', 'crack', 'ecstacy', 'heroin', 'meth', 'nicotine']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, benzos_encoder,
 cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder, heroin_encoder,
 meth_encoder, nicotine_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(amphetamine_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(amphetamine_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with benzos as the target variable.

# (source column, indexed column) for every categorical attribute. 'Benzos' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'amphetamineIndex'),
    ('Benzos', 'label'),
    ('Cannabis', 'cannabisIndex'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'heroinIndex'),
    ('Meth', 'methIndex'),
    ('Nicotine', 'nicotineIndex'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'amphetamine', 'cannabis',
                      'cocaine', 'crack', 'ecstacy', 'heroin', 'meth', 'nicotine']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
 cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder, heroin_encoder,
 meth_encoder, nicotine_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(benzos_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(benzos_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Prepare the dataset again, this time with meth as the target variable.

# (source column, indexed column) for every categorical attribute. 'Meth' is
# indexed straight into 'label' because it is the target of this model.
index_columns = [
    ('Age', 'ageIndex'),
    ('Gender', 'genderIndex'),
    ('Education', 'educationIndex'),
    ('Alcohol', 'alcoholIndex'),
    ('Amphetamine', 'amphetamineIndex'),
    ('Benzos', 'benzosIndex'),
    ('Cannabis', 'cannabisIndex'),
    ('Cocaine', 'cocaineIndex'),
    ('Crack', 'crackIndex'),
    ('Ecstacy', 'ecstacyIndex'),
    ('Heroin', 'heroinIndex'),
    ('Meth', 'label'),
    ('Nicotine', 'nicotineIndex'),
]
indexer_stages = [StringIndexer(inputCol=source, outputCol=indexed)
                  for source, indexed in index_columns]
(age_indexer, gender_indexer, education_indexer, alcohol_indexer, amphetamine_indexer,
 benzos_indexer, cannabis_indexer, cocaine_indexer, crack_indexer, ecstacy_indexer,
 heroin_indexer, meth_indexer, nicotine_indexer) = indexer_stages

# One-hot encode every indexed attribute except the target.
encoded_attributes = ['age', 'gender', 'education', 'alcohol', 'amphetamine', 'benzos',
                      'cannabis', 'cocaine', 'crack', 'ecstacy', 'heroin', 'nicotine']
encoder_stages = [OneHotEncoder(inputCol=attribute + 'Index', outputCol=attribute + 'Vec')
                  for attribute in encoded_attributes]
(age_encoder, gender_encoder, education_encoder, alcohol_encoder, amphetamine_encoder,
 benzos_encoder, cannabis_encoder, cocaine_encoder, crack_encoder, ecstacy_encoder,
 heroin_encoder, nicotine_encoder) = encoder_stages

# Feature vector: the one-hot vectors followed by the personality score columns.
personality_columns = ['Neuroticism', 'Extraversion', 'Openness to experience', 'Agreeableness',
                       'Conscientiousness', 'Impulsiveness', 'Sensation seeking']
assembler = VectorAssembler(
    inputCols=[attribute + 'Vec' for attribute in encoded_attributes] + personality_columns,
    outputCol="features")

# Stage order: all indexers, then all encoders, then the assembler.
pipeline = Pipeline(stages=indexer_stages + encoder_stages + [assembler])

pipeline_model = pipeline.fit(meth_dataset)

# Apply the fitted stages, then keep only the label and the assembled features.
pipe_df = pipeline_model.transform(meth_dataset).select('label', 'features')

In [None]:
# Train a decision tree on the prepared (label, features) DataFrame, then report its
# test error, confusion-matrix counts, recall and precision.
# Code adapted from examples in the Spark documentation.

# 70/30 train/test split. The seed is fixed so that re-running the notebook on the
# same input gives the same split instead of different metrics on every run.
(trainingData, testData) = pipe_df.randomSplit([0.7, 0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# The tree is wrapped in a one-stage Pipeline, so the fitted tree is model.stages[0].
pipeline = Pipeline(stages=[dt])

model = pipeline.fit(trainingData)

# Score the held-out rows; the "prediction" column used below comes from here.
results = model.transform(testData)

results.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[0]

# summary only
print(treeModel)

# Confusion-matrix counts, treating label 1.0 as the positive class. A single
# groupBy yields every (label, prediction) count in one Spark job.
confusion = {
    (row['label'], row['prediction']): row['count']
    for row in results.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
tp = confusion.get((1.0, 1.0), 0)
tn = confusion.get((0.0, 0.0), 0)
fp = confusion.get((0.0, 1.0), 0)
fn = confusion.get((1.0, 0.0), 0)
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
# Every scored row falls in exactly one group, so the sum is the row count.
print("Total: " + str(sum(confusion.values())))

# Show precision & recall.
# Both ratios are undefined when their denominator is zero (no actual positives in
# the test split, or no positive predictions); report NaN instead of raising
# ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else float('nan')
print("Recall:", r)

p = float(tp) / (tp + fp) if (tp + fp) else float('nan')
print("Precision:", p)

In [None]:
# Bar chart of row counts per Amphetamine category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Amphetamine')
         .size()
         .plot(kind='bar', title='Amphetamine use - Female'))

In [None]:
# Bar chart of row counts per Amphetamine category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Amphetamine')
         .size()
         .plot(kind='bar', title='Amphetamine use - Male'))

In [None]:
# Bar chart of row counts per Benzos category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Benzos')
         .size()
         .plot(kind='bar', title='Benzos use - Female'))

In [None]:
# Bar chart of row counts per Benzos category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Benzos')
         .size()
         .plot(kind='bar', title='Benzos use - Male'))

In [None]:
# Bar chart of row counts per Cannabis category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Cannabis')
         .size()
         .plot(kind='bar', title='Cannabis use - Female'))

In [None]:
# Bar chart of row counts per Cannabis category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Cannabis')
         .size()
         .plot(kind='bar', title='Cannabis use - Male'))

In [None]:
# Bar chart of row counts per Cocaine category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Cocaine')
         .size()
         .plot(kind='bar', title='Cocaine use - Female'))

In [None]:
# Bar chart of row counts per Cocaine category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Cocaine')
         .size()
         .plot(kind='bar', title='Cocaine use - Male'))

In [None]:
# Bar chart of row counts per Heroin category for rows where Gender is 'Female'.
# Titled so the chart identifies the gender it shows.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Heroin')
         .size()
         .plot(kind='bar', title='Heroin use - Female'))

In [None]:
# Bar chart of row counts per Heroin category for rows where Gender is 'Male'.
# This cell previously filtered on 'Female' again, duplicating the cell before it
# and leaving heroin as the only plotted drug without a 'Male' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Heroin')
         .size()
         .plot(kind='bar', title='Heroin use - Male'))

In [None]:
# Bar chart of row counts per Meth category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Meth')
         .size()
         .plot(kind='bar', title='Meth use - Female'))

In [None]:
# Bar chart of row counts per Meth category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Meth')
         .size()
         .plot(kind='bar', title='Meth use - Male'))

In [None]:
# Bar chart of row counts per Nicotine category for rows where Gender is 'Female'.
# Titled so it can be told apart from the matching 'Male' chart.
(drugdata.filter("Gender = 'Female'")
         .toPandas()
         .groupby('Nicotine')
         .size()
         .plot(kind='bar', title='Nicotine use - Female'))

In [None]:
# Bar chart of row counts per Nicotine category for rows where Gender is 'Male'.
# Titled so it can be told apart from the matching 'Female' chart.
(drugdata.filter("Gender = 'Male'")
         .toPandas()
         .groupby('Nicotine')
         .size()
         .plot(kind='bar', title='Nicotine use - Male'))

In [None]:
# Personality data by gender: the mean of each trait per gender, formatted to
# 2 decimal places and shown under a short column name.

# (averaged column produced by groupBy().mean(), name displayed in the table)
trait_columns = [
    ('avg(Neuroticism)', 'Neuroticism'),
    ('avg(Extraversion)', 'Extraversion'),
    ('avg(Openness to experience)', 'Openness...'),
    ('avg(Agreeableness)', 'Agreeableness'),
    ('avg(Conscientiousness)', 'Conscientiousness'),
    ('avg(Impulsiveness)', 'Impulsiveness'),
    ('avg(Sensation seeking)', 'Sensation seeking'),
]
# Format and rename in a single select rather than selecting the formatted
# columns first and aliasing them in a second pass.
gender_stats = drugdata.groupBy('Gender').mean().select(
    'Gender',
    *[format_number(averaged, 2).alias(shown) for averaged, shown in trait_columns])
gender_stats.orderBy('Gender').show()