In [235]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Load the data
data = pd.read_csv("data.csv")

cols = data.columns
cols = cols[:-1] # Selecting all columns except the last one (disease)
print(cols)

df = data.iloc[:, :-1].astype(str)
print(df.head())

Index(['blackheads', 'cysts', 'inflammation', 'nodules', 'papules', 'pustules',
       'redness', 'whiteheads', 'coughing', 'fatigue',
       ...
       'rectal_pain', 'strong_urge_to_urinate', 'strong_smelling_urine',
       'red_eyes', 'male', 'female', 'infant', 'child', 'adult', 'senior'],
      dtype='object', length=632)
  blackheads cysts inflammation nodules papules pustules redness whiteheads  \
0          1     1            1       1       1        1       1          1   
1          1     1            1       1       1        1       1          1   
2          1     1            1       1       1        1       1          1   
3          1     1            1       1       1        1       1          1   
4          0     0            0       0       0        0       0          0   

  coughing fatigue  ... rectal_pain strong_urge_to_urinate  \
0        0       0  ...           0                      0   
1        0       0  ...           0                      0   
2        0

In [236]:
# process the data to get the list of symptoms
docs = []
for i in range(len(df)):
    item = []
    for j in range(len(cols)):
        if df.iloc[i][j] == '1':
            item.append(cols[j])
    docs.append(','.join(item))
    
print(docs)

['blackheads,cysts,inflammation,nodules,papules,pustules,redness,whiteheads,female,adult', 'blackheads,cysts,inflammation,nodules,papules,pustules,redness,whiteheads,male,adult', 'blackheads,cysts,inflammation,nodules,papules,pustules,redness,whiteheads,female,child', 'blackheads,cysts,inflammation,nodules,papules,pustules,redness,whiteheads,male,child', 'coughing,fatigue,headache,itchy_eyes,itchy_nose,itchy_throat,nasal_congestion,runny_nose,sneezing,watery_eyes,female,adult', 'coughing,fatigue,headache,itchy_eyes,itchy_nose,itchy_throat,nasal_congestion,runny_nose,sneezing,watery_eyes,male,adult', 'coughing,fatigue,itchy_eyes,itchy_throat,nasal_congestion,runny_nose,sneezing,watery_eyes,dark_circles_under_eyes,itchy_ears,female,child', 'coughing,fatigue,itchy_eyes,itchy_throat,nasal_congestion,runny_nose,sneezing,watery_eyes,dark_circles_under_eyes,itchy_ears,male,child', 'coughing,fatigue,itchy_eyes,itchy_throat,nasal_congestion,runny_nose,sneezing,watery_eyes,dark_circles_under_eye

In [224]:
#instantiate CountVectorizer() 
cv=CountVectorizer() 

# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(docs)

word_count_vector.shape

(630, 635)

In [225]:
# compute the itf values
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 

# sort
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
female,1.682115
male,1.701103
fatigue,1.860057
adult,2.144001
fever,2.259920
...,...
underlying_medical_conditions,6.754159
lack_of_energy,6.754159
leg_pains,6.754159
darkening_of_skin,6.754159


In [237]:
# K Nearest Neighbors
k = 5 
knn_model = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
knn_model.fit(word_count_vector)

In [238]:
input_symptoms = ['blackheads', 'cysts', 'inflammation', 'nodules',]
input_vector = cv.transform(input_symptoms)
distances, indices = knn_model.kneighbors(input_vector)

top_diseases = data.iloc[indices[0]]['disease'].value_counts()
pd.DataFrame(top_diseases)

Unnamed: 0_level_0,count
disease,Unnamed: 1_level_1
acne,4
multiple_sclerosis,1


In [239]:
all_symptoms = []
for i in indices[-1]:
    symp = docs[i].split(',')
    symp = symp[: -2]
    all_symptoms += [s for s in symp if s not in input_symptoms]

# recommend the least common symptoms based on the idf score
all_symptoms = list(set(all_symptoms))
all_symptoms.sort(key=lambda x: df_idf.loc[x]['idf_weights'], reverse=True)

# print all symptoms with idf score
pd.DataFrame(all_symptoms, columns=['Symptom']).join(df_idf, on='Symptom')


Unnamed: 0,Symptom,idf_weights
0,unsteady_gait,6.348694
1,weakness_in_limbs,6.348694
2,lack_of_coordination,6.348694
3,tremor,6.348694
4,electric_shock_sensations_on_neck_movement,6.348694
5,pustules,5.837868
6,whiteheads,5.837868
7,papules,5.837868
8,slurred_speech,5.250081
9,double_vision,5.250081


In [240]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(knn_model, f)
    pickle.dump(cv, f)
    pickle.dump(df_idf, f)
    pickle.dump(data, f)
    pickle.dump(all_symptoms, f)
    pickle.dump(input_symptoms, f)
    pickle.dump(top_diseases, f)
    pickle.dump(indices, f)
    pickle.dump(distances, f)
    pickle.dump(k, f)
    pickle.dump(cols, f)
    pickle.dump(df, f)
    pickle.dump(docs, f)
    pickle.dump(word_count_vector, f)
    pickle.dump(tfidf_transformer, f)
    pickle.dump(input_vector, f)

print('Model saved successfully!')

Model saved successfully!
