# IMPORTING THE DEPENDENCIES

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
import requests


### READING THE DATA

In [4]:
# Load the training set (path is relative to the current working directory)
data = pd.read_csv(filepath_or_buffer='../Data/Training.csv')

In [5]:
### PRINTING THE FIRST 5 ROWS OF THE DATASET

In [6]:
# Preview the first five rows
data.head(n=5)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,diseases
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [7]:
### PRINTING THE NUMBER OF ROWS AND COLUMNS

In [8]:
# (number of rows, number of columns) of the loaded frame
data.shape

(4920, 133)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,...,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0
mean,0.137805,0.159756,0.021951,0.045122,0.021951,0.162195,0.139024,0.045122,0.045122,0.021951,...,0.021951,0.021951,0.021951,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171,0.023171
std,0.34473,0.366417,0.146539,0.207593,0.146539,0.368667,0.346007,0.207593,0.207593,0.146539,...,0.146539,0.146539,0.146539,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461,0.150461
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Count the missing values in every column (isna is the canonical name; isnull is its alias)
data.isna().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
diseases                0
Length: 133, dtype: int64

In [14]:
# Class balance check: number of rows for each label in the 'diseases' column
data['diseases'].value_counts()

Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
Allergy      

The column labeled 'diseases' holds the target label (the diagnosed disease) for each row. There are 41 distinct diseases in the dataset.

In [16]:
## SEPARATING THE FEATURES FROM THE LABELS
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
Y = data['diseases']
X = data.drop(columns=['diseases'])

In [None]:
# Print the feature frame X (only X is printed in this cell)
print(X)

In [18]:
# Print the label Series Y (one disease name per row)
print(Y)

0                              Fungal infection
1                              Fungal infection
2                              Fungal infection
3                              Fungal infection
4                              Fungal infection
                         ...                   
4915    (vertigo) Paroymsal  Positional Vertigo
4916                                       Acne
4917                    Urinary tract infection
4918                                  Psoriasis
4919                                   Impetigo
Name: diseases, Length: 4920, dtype: object


## DATA STANDARDIZATION

In [19]:
# Scaler with sklearn's default settings; it is fitted in the next cell
scaler = StandardScaler()

In [20]:
# Learn the scaling statistics from the full feature frame.
# NOTE(review): this runs before the train/test split further down, so the test rows
# also influence the scaling; consider fitting on the training split only.
scaler.fit(X)

In [21]:
# Apply the fitted scaler to every row of X
standardized_data = scaler.transform(X)

In [22]:
# Inspect the scaled values
print(standardized_data)

[[ 2.50132708  2.29336913  6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913  6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 [ 2.50132708 -0.4360397   6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 ...
 [-0.39978778 -0.4360397  -0.14981285 ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913 -0.14981285 ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913 -0.14981285 ...  6.49291111  6.49291111
   6.49291111]]


In [24]:
# From this cell onward X refers to the standardized values rather than the raw
# feature frame; Y is still the 'diseases' label column.
X, Y = standardized_data, data['diseases']

In [25]:
# Show the features and then the labels, each starting on its own line
print(X, Y, sep='\n')

[[ 2.50132708  2.29336913  6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913  6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 [ 2.50132708 -0.4360397   6.6749948  ... -0.15401412 -0.15401412
  -0.15401412]
 ...
 [-0.39978778 -0.4360397  -0.14981285 ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913 -0.14981285 ... -0.15401412 -0.15401412
  -0.15401412]
 [-0.39978778  2.29336913 -0.14981285 ...  6.49291111  6.49291111
   6.49291111]]
0                              Fungal infection
1                              Fungal infection
2                              Fungal infection
3                              Fungal infection
4                              Fungal infection
                         ...                   
4915    (vertigo) Paroymsal  Positional Vertigo
4916                                       Acne
4917                    Urinary tract infection
4918                                  Psoriasis
4919                        

## TRAIN TEST SPLIT

In [26]:
# Disabled alternative that would read a separate hold-out file instead of splitting:
#X_test=pd.read_csv('../Data/Testing.csv')
#Y_train = X_test['prognosis']

# Hold out 20% of the rows for testing; stratify on Y so the label proportions are
# kept in both parts, and fix the seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [27]:
# Shapes of the full, training and testing feature matrices, in that order
print(*(part.shape for part in (X, X_train, X_test)))

(4920, 132) (3936, 132) (984, 132)


## TRAINING THE MODEL

### SUPPORT VECTOR MODEL

In [28]:
# Support vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

# Train the support vector machine classifier on the training split
classifier.fit(X_train, Y_train)

### KMEANS MODEL

In [46]:
# Elbow-method sketch, left disabled. Two fixes are needed before re-enabling it:
# `wss` must be initialised before the loop, and the fitted attribute is
# spelled `inertia_` (not `intertia_`).
# wss = []
# for i in range(1,11):
#     kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state=0)
#     kmeans.fit(X)
#     wss.append(kmeans.inertia_)



NameError: name 'wss' is not defined

## MODEL EVALUATION

#### ACCURACY SCORE


In [30]:
##accuracy score on the TRAINING DATA
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [31]:
##Printing the accuracy score of the training data
print("Accuracy score of the training data is: ", training_data_accuracy)

Accuracy score of the training data is:  1.0


In [32]:
##accuracy score on the TESTING DATA
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [33]:
##Printing the accuracy score of the testing data
print("Accuracy score of the testing data is: ", testing_data_accuracy)

Accuracy score of the testing data is:  1.0


## MAKING A PREDICTIVE SYSTEM

In [38]:
# Build a single-patient symptom vector and predict its disease.
#
# 0-based positions (in training-column order) of the symptom flags set to 1 for
# this patient; every other flag is 0. This is the same vector the cell previously
# spelled out as one long literal.
active_symptom_positions = (0, 1, 7, 12, 13)

# Feature columns in training order: every column of `data` except the label.
feature_columns = data.columns.drop('diseases')

# Size the vector from the training columns so it always matches what the scaler expects.
input_data = tuple(
    1 if position in active_symptom_positions else 0
    for position in range(len(feature_columns))
)

#changing the input_data to numpy array
input_data_as_array = np.asarray(input_data)

#reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_array.reshape(1, -1)

# Label the columns with the training feature names: the scaler was fitted on a
# DataFrame, and recent scikit-learn versions warn when it is later given unnamed input.
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_columns)

#standardizing the input data
std_data = scaler.transform(input_frame)
#print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

['Drug Reaction']


