In [39]:
from pyspark.sql import SQLContext, types, SparkSession
from pyspark.sql.functions import *
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import re
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Turn on eager evaluation so a Spark DataFrame left as the last expression of
# a cell is rendered directly in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

## Loading the dataframe

In [40]:
# Read the CSV, treating its first row as the column names. No schema is
# supplied or inferred, so every column is loaded as a string.
my_mapping_dataframe = spark.read.csv("df_diseases_new.csv", header=True)
my_mapping_dataframe.printSchema()

root
 |-- name: string (nullable = true)
 |-- categories: string (nullable = true)



# Cleaning data

In [41]:
# Normalise both text columns identically: lower-case, delete every character
# outside [-_,A-Za-z0-9 ], collapse runs of spaces to one, then trim.
# Characters are stripped BEFORE collapsing/trimming because deleting a
# character can itself leave a doubled, leading or trailing space
# (e.g. "category . 1" -> "category  1"), which would then fail the exact
# string comparisons in the label mapping below.
for text_column in ("name", "categories"):
    my_mapping_dataframe = my_mapping_dataframe.withColumn(
        text_column,
        trim(
            regexp_replace(
                regexp_replace(lower(col(text_column)), "[^-_,A-Za-z0-9 ]+", ""),
                " +",
                " ",
            )
        ),
    )

# Encode the four known labels as 0-3. There is no otherwise() branch, so any
# value that is not one of these four strings (including null) becomes null.
# NOTE: this cell rebinds my_mapping_dataframe. Running it a second time without
# re-loading feeds the already-encoded integers back through this mapping,
# where they match no branch and every label turns into null.
my_mapping_dataframe = my_mapping_dataframe.withColumn(
    "categories",
    when(col("categories") == "category 1", 0)
    .when(col("categories") == "category 2", 1)
    .when(col("categories") == "category 3", 2)
    .when(col("categories") == "category 4", 3),
)

# Splitting the data into X's and y's

In [44]:
# Collect the Spark frame once, so that X and y are built from the same
# collected rows and the Spark job is not executed twice.
mapping_pdf = my_mapping_dataframe.toPandas()

# Features: one-hot encode the "name" column (everything except the label).
X = pd.get_dummies(mapping_pdf.drop(columns=["categories"]), columns=["name"])

# Sanitise the generated dummy column names: every run of non-word characters
# or underscores becomes a single "_", edges are stripped, result is lower-case.
# The pattern is a raw string because "\W" in a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE(review): two distinct names that differ only in punctuation collapse to
# the same column name here - confirm the data has no such pairs.
cols = [
    re.sub(r'[\W_]+', ' ', str(column)).strip().replace(" ", "_").lower()
    for column in X.columns
]
X.columns = cols

# Labels: kept as a one-column DataFrame holding "categories".
y = mapping_pdf.drop(columns=["name"])


# Train/test split

In [45]:
# Hold out 20% of the rows for testing and keep 80% for training; the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=15,
)

