# ParkinsonPy
---

## Starter Code to Import Libraries and Load the Data

In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the CSV files into a Pandas DataFrame
parkinson_clinical_df = pd.read_csv("C:/Users/Marc Roca/Documents/Predicting_Parkinsons_Progression/PD-datasets/train_clinical_data.csv")
protein_df = pd.read_csv("C:/Users/Marc Roca/Documents/Predicting_Parkinsons_Progression/PD-datasets/train_proteins.csv")
peptides_df = pd.read_csv("C:/Users/Marc Roca/Documents/Predicting_Parkinsons_Progression/PD-datasets/train_peptides.csv")


# Display sample data
parkinson_clinical_df.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [3]:
protein_df.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [4]:
peptides_df.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [5]:
# Help to get the columns title
# parkinson_clinical_df.columns
# protein_df.columns
# peptides_df.columns

In [6]:
## Dtypes of the dfs
print("clinical df:")
print(parkinson_clinical_df.dtypes)
print("protein df:")
print(protein_df.dtypes)
print("peptides df:")
print(peptides_df.dtypes)

clinical df:
visit_id                                object
patient_id                               int64
visit_month                              int64
updrs_1                                float64
updrs_2                                float64
updrs_3                                float64
updrs_4                                float64
upd23b_clinical_state_on_medication     object
dtype: object
protein df:
visit_id        object
visit_month      int64
patient_id       int64
UniProt         object
NPX            float64
dtype: object
peptides df:
visit_id             object
visit_month           int64
patient_id            int64
UniProt              object
Peptide              object
PeptideAbundance    float64
dtype: object


In [7]:
### Get unique values number of specific columns
print(protein_df['UniProt'].nunique())
print(peptides_df['Peptide'].nunique())

227
968


# Uniprot DF

In [8]:
edited_protein_df= protein_df[['visit_id', 'UniProt']]
edited_protein_df

Unnamed: 0,visit_id,UniProt
0,55_0,O00391
1,55_0,O00533
2,55_0,O00584
3,55_0,O14498
4,55_0,O14773
...,...,...
232736,58648_108,Q9UBX5
232737,58648_108,Q9UHG2
232738,58648_108,Q9UKV8
232739,58648_108,Q9Y646


In [9]:
### loop to get the proteins that a patient has per visit_id
x = "55_0"
x_last = "58648_108	"
protein_list = []
summary_list = []

for row in edited_protein_df.iterrows():
    if x != row[1][0]:
        #Dictionary
        summary_dict = {
            "visit_id": x,
            "UniProt": protein_list,
            }
        #appending values
        summary_list.append(summary_dict)
        
        x = row[1][0]
        protein_list = []
        UniProt = row[1][1]
        protein_list.append(UniProt)
        
    elif x == row[1][0]:
        UniProt = row[1][1]
        protein_list.append(UniProt)
        
print(summary_list)

[{'visit_id': '55_0', 'UniProt': ['O00391', 'O00533', 'O00584', 'O14498', 'O14773', 'O14791', 'O15240', 'O15394', 'O43505', 'O60888', 'O75144', 'O75326', 'O94919', 'P00441', 'P00450', 'P00734', 'P00736', 'P00738', 'P00746', 'P00747', 'P00748', 'P00751', 'P01008', 'P01009', 'P01011', 'P01019', 'P01023', 'P01024', 'P01031', 'P01033', 'P01034', 'P01042', 'P01344', 'P01591', 'P01608', 'P01621', 'P01717', 'P01780', 'P01833', 'P01834', 'P01857', 'P01859', 'P01860', 'P01861', 'P01876', 'P01877', 'P02452', 'P02647', 'P02649', 'P02652', 'P02655', 'P02656', 'P02671', 'P02675', 'P02679', 'P02747', 'P02748', 'P02749', 'P02750', 'P02751', 'P02753', 'P02760', 'P02763', 'P02765', 'P02766', 'P02768', 'P02774', 'P02787', 'P02790', 'P04004', 'P04075', 'P04156', 'P04180', 'P04196', 'P04207', 'P04211', 'P04216', 'P04217', 'P04275', 'P04406', 'P04433', 'P05060', 'P05067', 'P05090', 'P05155', 'P05156', 'P05408', 'P05452', 'P05546', 'P06310', 'P06396', 'P06454', 'P06681', 'P06727', 'P07195', 'P07225', 'P0733

In [10]:
#Turning summary_list into a dataframe
Uniprot_df = pd.DataFrame(summary_list)
Uniprot_df

Unnamed: 0,visit_id,UniProt
0,55_0,"[O00391, O00533, O00584, O14498, O14773, O1479..."
1,1517_0,"[O00391, O00533, O00584, O14773, O14791, O1524..."
2,1923_0,"[O00391, O00533, O00584, O14773, O15240, O1539..."
3,2660_0,"[O00533, O00584, O14498, O14773, O14791, O1524..."
4,3636_0,"[O00391, O00533, O00584, O14498, O14773, O1479..."
...,...,...
1107,5645_96,"[O00391, O00533, O00584, O14498, O14773, O1479..."
1108,58648_96,"[O00533, O00584, O14498, O14773, O14791, O1524..."
1109,5645_108,"[O00391, O00533, O00584, O14498, O14773, O1524..."
1110,12703_108,"[O00391, O00533, O00584, O14498, O14773, O1479..."


# New DF- Uniprot_peptide_df

In [11]:
### get DF I want to use
edited_peptides_df = peptides_df[['UniProt', 'Peptide']]
edited_peptides_df

Unnamed: 0,UniProt,Peptide
0,O00391,NEQEQPLGQWHLS
1,O00533,GNPEPTFSWTK
2,O00533,IEIPSSVQQVPTIIK
3,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK
4,O00533,SMEQNGPGLEYR
...,...,...
981829,Q9UHG2,ILAGSADSEGVAAPR
981830,Q9UKV8,SGNIPAGTTVDTK
981831,Q9Y646,LALLVDTVGPR
981832,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R


In [12]:
### Drop values that are duplicated
edited_peptides_df =edited_peptides_df.drop_duplicates(subset=['UniProt', 'Peptide'])
edited_peptides_df

Unnamed: 0,UniProt,Peptide
0,O00391,NEQEQPLGQWHLS
1,O00533,GNPEPTFSWTK
2,O00533,IEIPSSVQQVPTIIK
3,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK
4,O00533,SMEQNGPGLEYR
...,...,...
2046,P02647,QKVEPLRAELQEGAR
2223,P02774,RTHLPEVFLSK
2252,P02787,INHC(UniMod_4)RFDEFFSEGC(UniMod_4)APGSKK
2778,P00748,EQPPSLTR


In [13]:
### Sort values in terms of Uniprot
edited_peptides_df = edited_peptides_df.sort_values(["UniProt"], ascending = False).reset_index()
edited_peptides_df = edited_peptides_df[['UniProt', 'Peptide']]
edited_peptides_df.head()

Unnamed: 0,UniProt,Peptide
0,Q9Y6R7,AGC(UniMod_4)VAESTAVC(UniMod_4)R
1,Q9Y6R7,GATTSPGVYELSSR
2,Q9Y646,LALLVDTVGPR
3,Q9UNU6,KNM(UniMod_35)FEFLK
4,Q9UKV8,SGNIPAGTTVDTK


In [14]:
### Loop to get a list of peptides that make up a protein
x = "Q9Y6R7"
peptides_list = []
summary_list = []

for row in edited_peptides_df.iterrows():
    if x != row[1][0]:
        #Dictionary
        summary_dict = {
            "UniProt": x,
            "Peptide": peptides_list,
            }
        #appending values
        summary_list.append(summary_dict)
        
        x = row[1][0]
        peptides_list = []
        Peptide = row[1][1]
        peptides_list.append(Peptide)
        
    elif x == row[1][0]:
        Peptide = row[1][1]
        peptides_list.append(Peptide)
        
print(summary_list)

[{'UniProt': 'Q9Y6R7', 'Peptide': ['AGC(UniMod_4)VAESTAVC(UniMod_4)R', 'GATTSPGVYELSSR']}, {'UniProt': 'Q9Y646', 'Peptide': ['LALLVDTVGPR']}, {'UniProt': 'Q9UNU6', 'Peptide': ['KNM(UniMod_35)FEFLK']}, {'UniProt': 'Q9UKV8', 'Peptide': ['SGNIPAGTTVDTK']}, {'UniProt': 'Q9UHG2', 'Peptide': ['AEAQEAEDQQAR', 'ARAEAQEAEDQQAR', 'GEAAGAVQELAR', 'ILAGSADSEGVAAPR']}, {'UniProt': 'Q9UBX5', 'Peptide': ['C(UniMod_4)MC(UniMod_4)PAENPGC(UniMod_4)R', 'DQPFTILYR']}, {'UniProt': 'Q9UBR2', 'Peptide': ['NVDGVNYASITR']}, {'UniProt': 'Q9NYU2', 'Peptide': ['FTILDSQGK']}, {'UniProt': 'Q9NQ79', 'Peptide': ['GVALADFNR', 'GVASLFAGR']}, {'UniProt': 'Q9HDC9', 'Peptide': ['VLLDQLR']}, {'UniProt': 'Q9BY67', 'Peptide': ['VHKEDDGVPVIC(UniMod_4)QVEHPAVTGNLQTQR']}, {'UniProt': 'Q99969', 'Peptide': ['EAEEHQETQC(UniMod_4)LR']}, {'UniProt': 'Q99832', 'Peptide': ['VPEEDLKR']}, {'UniProt': 'Q99829', 'Peptide': ['QALPQVR']}, {'UniProt': 'Q99683', 'Peptide': ['LLEELVR']}, {'UniProt': 'Q99674', 'Peptide': ['HVEPGEPLAPSPQEPQAVGR'

In [15]:
### Convert List into a DF
Uniprot_peptide_df = pd.DataFrame(summary_list)
Uniprot_peptide_df

Unnamed: 0,UniProt,Peptide
0,Q9Y6R7,"[AGC(UniMod_4)VAESTAVC(UniMod_4)R, GATTSPGVYEL..."
1,Q9Y646,[LALLVDTVGPR]
2,Q9UNU6,[KNM(UniMod_35)FEFLK]
3,Q9UKV8,[SGNIPAGTTVDTK]
4,Q9UHG2,"[AEAQEAEDQQAR, ARAEAQEAEDQQAR, GEAAGAVQELAR, I..."
...,...,...
221,O14791,[VTEPISAESGEQVER]
222,O14773,"[LFGGNFAHQASVAR, LYQQHGAGLFDVTR]"
223,O14498,[ALPGTPVASSQPR]
224,O00584,"[ELDLNSVLLK, HGTC(UniMod_4)AAQVDALNSQKK]"


# Summary- Group by

In [16]:
### Group edited_protein_df
grouped_edited_protein_df = edited_protein_df.groupby(["visit_id"]).count()

In [17]:
### Look at df
grouped_edited_protein_df

Unnamed: 0_level_0,UniProt
visit_id,Unnamed: 1_level_1
10053_0,165
10053_12,171
10053_18,208
10138_12,217
10138_24,219
...,...
8699_24,216
942_12,212
942_24,217
942_48,216


In [18]:
### Sort DF
grouped_edited_protein_df =grouped_edited_protein_df.sort_values(["UniProt"], ascending = False)
grouped_edited_protein_df

Unnamed: 0_level_0,UniProt
visit_id,Unnamed: 1_level_1
47171_6,224
48780_6,224
55_36,224
47171_12,224
27715_36,224
...,...
58648_6,159
18560_48,157
58648_84,157
11928_0,138


# Merging Data Frame

In [19]:
###Merging
merged_df = parkinson_clinical_df.merge(Uniprot_df,how = "left", on=['visit_id'])

In [20]:
###Viewing Data
merged_df

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt
0,55_0,55,0,10.0,6.0,15.0,,,"[O00391, O00533, O00584, O14498, O14773, O1479..."
1,55_3,55,3,10.0,7.0,25.0,,,
2,55_6,55,6,8.0,10.0,34.0,,,"[O00391, O00533, O00584, O14498, O14773, O1479..."
3,55_9,55,9,8.0,9.0,30.0,0.0,On,
4,55_12,55,12,10.0,10.0,41.0,0.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
...,...,...,...,...,...,...,...,...,...
2610,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479..."
2611,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,
2612,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,
2613,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,


In [21]:
### Other Merge because I dont know which one ill use

In [22]:
###Merging
merged_df_alterante = parkinson_clinical_df.merge(edited_protein_df,how = "left", on=['visit_id'])
merged_df_alterante

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt
0,55_0,55,0,10.0,6.0,15.0,,,O00391
1,55_0,55,0,10.0,6.0,15.0,,,O00533
2,55_0,55,0,10.0,6.0,15.0,,,O00584
3,55_0,55,0,10.0,6.0,15.0,,,O14498
4,55_0,55,0,10.0,6.0,15.0,,,O14773
...,...,...,...,...,...,...,...,...,...
224810,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9Y6R7
224811,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,
224812,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,
224813,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,


# Alternate big df

In [23]:
### Joins peptides df with merged_df_alterante
### I will perform the cleaning in this section as well

In [24]:
###Merging
merged_df_alterante_2 = merged_df_alterante.merge(Uniprot_peptide_df,how = "left", on=['UniProt'])
merged_df_alterante_2

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt,Peptide
0,55_0,55,0,10.0,6.0,15.0,,,O00391,
1,55_0,55,0,10.0,6.0,15.0,,,O00533,"[VIAVNEVGR, TLKIENVSYQDKGNYR, SMEQNGPGLEYR, KP..."
2,55_0,55,0,10.0,6.0,15.0,,,O00584,"[ELDLNSVLLK, HGTC(UniMod_4)AAQVDALNSQKK]"
3,55_0,55,0,10.0,6.0,15.0,,,O14498,[ALPGTPVASSQPR]
4,55_0,55,0,10.0,6.0,15.0,,,O14773,"[LFGGNFAHQASVAR, LYQQHGAGLFDVTR]"
...,...,...,...,...,...,...,...,...,...,...
224810,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9Y6R7,"[AGC(UniMod_4)VAESTAVC(UniMod_4)R, GATTSPGVYEL..."
224811,65043_54,65043,54,4.0,8.0,11.0,1.0,Off,,
224812,65043_60,65043,60,6.0,6.0,16.0,1.0,Off,,
224813,65043_72,65043,72,3.0,9.0,14.0,1.0,Off,,


# Cleaning Data

In [25]:
##getting info
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615 entries, 0 to 2614
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   visit_id                             2615 non-null   object 
 1   patient_id                           2615 non-null   int64  
 2   visit_month                          2615 non-null   int64  
 3   updrs_1                              2614 non-null   float64
 4   updrs_2                              2613 non-null   float64
 5   updrs_3                              2590 non-null   float64
 6   updrs_4                              1577 non-null   float64
 7   upd23b_clinical_state_on_medication  1288 non-null   object 
 8   UniProt                              1067 non-null   object 
dtypes: float64(4), int64(2), object(3)
memory usage: 204.3+ KB


In [26]:
##dropping cells
merged_df_clean =merged_df.dropna()

In [27]:
##view new df
merged_df_clean

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt
4,55_12,55,12,10.0,10.0,41.0,0.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
8,55_36,55,36,17.0,18.0,51.0,0.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
28,1517_0,1517,0,11.0,6.0,25.0,5.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524..."
32,1517_24,1517,24,19.0,11.0,28.0,3.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
34,1517_36,1517,36,20.0,17.0,31.0,8.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
...,...,...,...,...,...,...,...,...,...
2581,64669_60,64669,60,15.0,15.0,38.0,0.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524..."
2594,64674_48,64674,48,11.0,17.0,46.0,1.0,On,"[O00533, O00584, O14498, O14773, O14791, O1524..."
2598,64674_84,64674,84,11.0,15.0,45.0,4.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524..."
2604,65043_12,65043,12,4.0,7.0,14.0,0.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479..."


In [28]:
### Alternate df cleaning
merged_df_alterante_clean = merged_df_alterante.dropna()
merged_df_alterante_clean

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt
439,55_12,55,12,10.0,10.0,41.0,0.0,On,O00391
440,55_12,55,12,10.0,10.0,41.0,0.0,On,O00533
441,55_12,55,12,10.0,10.0,41.0,0.0,On,O00584
442,55_12,55,12,10.0,10.0,41.0,0.0,On,O14498
443,55_12,55,12,10.0,10.0,41.0,0.0,On,O14773
...,...,...,...,...,...,...,...,...,...
224806,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9UBX5
224807,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9UHG2
224808,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9UKV8
224809,65043_48,65043,48,7.0,6.0,13.0,0.0,Off,Q9Y646


# Analysis- with UPDRS_3 -- Alternate version

In [29]:
merged_df_clean.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,UniProt
4,55_12,55,12,10.0,10.0,41.0,0.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
8,55_36,55,36,17.0,18.0,51.0,0.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
28,1517_0,1517,0,11.0,6.0,25.0,5.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524..."
32,1517_24,1517,24,19.0,11.0,28.0,3.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
34,1517_36,1517,36,20.0,17.0,31.0,8.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."


In [30]:
merged_df_clean.columns

Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication', 'UniProt'],
      dtype='object')

In [31]:
df_analysis_updrs_3 = merged_df_clean[['visit_id', 'visit_month',
       'updrs_3', 'upd23b_clinical_state_on_medication', 'UniProt']]
df_analysis_updrs_3.head()

Unnamed: 0,visit_id,visit_month,updrs_3,upd23b_clinical_state_on_medication,UniProt
4,55_12,12,41.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
8,55_36,36,51.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
28,1517_0,0,25.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524..."
32,1517_24,24,28.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."
34,1517_36,36,31.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479..."


In [32]:
###32 and below is mild, 59 and above is severe

### Loop to get a list of peptides that make up a protein
updrs_3_bin = []

for row in df_analysis_updrs_3.iterrows():
    value_updrs = row[1][2]
    
    if value_updrs <= 32:
         updrs_3_bin.append("mild")
    else:
        updrs_3_bin.append("severe")


In [33]:
### Create new column
df_analysis_updrs_3['updrs_3_bin'] = updrs_3_bin
df_analysis_updrs_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis_updrs_3['updrs_3_bin'] = updrs_3_bin


Unnamed: 0,visit_id,visit_month,updrs_3,upd23b_clinical_state_on_medication,UniProt,updrs_3_bin
4,55_12,12,41.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe
8,55_36,36,51.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe
28,1517_0,0,25.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524...",mild
32,1517_24,24,28.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild
34,1517_36,36,31.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild
...,...,...,...,...,...,...
2581,64669_60,60,38.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe
2594,64674_48,48,46.0,On,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe
2598,64674_84,84,45.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe
2604,65043_12,12,14.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild


In [34]:
### Get protien len

In [35]:
### Loop to get a list of peptides that make up a protein
UniProt_len = []

for row in df_analysis_updrs_3.iterrows():
    uniprot = row[1][4]
    UniProt_len.append(len(uniprot))

# print(UniProt_len)

In [36]:
### Create new column
df_analysis_updrs_3['UniProt_len'] = UniProt_len
df_analysis_updrs_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis_updrs_3['UniProt_len'] = UniProt_len


Unnamed: 0,visit_id,visit_month,updrs_3,upd23b_clinical_state_on_medication,UniProt,updrs_3_bin,UniProt_len
4,55_12,12,41.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,222
8,55_36,36,51.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,224
28,1517_0,0,25.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524...",mild,217
32,1517_24,24,28.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
34,1517_36,36,31.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
...,...,...,...,...,...,...,...
2581,64669_60,60,38.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,201
2594,64674_48,48,46.0,On,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,210
2598,64674_84,84,45.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,207
2604,65043_12,12,14.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,218


In [37]:
df_analysis_updrs_3=df_analysis_updrs_3.reset_index()

In [38]:
df_analysis_updrs_3  = df_analysis_updrs_3[['visit_id', 'visit_month', 'updrs_3',
       'upd23b_clinical_state_on_medication', 'UniProt', 'updrs_3_bin',
       'UniProt_len']] 

In [39]:
df_analysis_updrs_3

Unnamed: 0,visit_id,visit_month,updrs_3,upd23b_clinical_state_on_medication,UniProt,updrs_3_bin,UniProt_len
0,55_12,12,41.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,222
1,55_36,36,51.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,224
2,1517_0,0,25.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524...",mild,217
3,1517_24,24,28.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
4,1517_36,36,31.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
...,...,...,...,...,...,...,...
436,64669_60,60,38.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,201
437,64674_48,48,46.0,On,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,210
438,64674_84,84,45.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,207
439,65043_12,12,14.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,218


In [40]:
df_analysis_updrs_3.columns

Index(['visit_id', 'visit_month', 'updrs_3',
       'upd23b_clinical_state_on_medication', 'UniProt', 'updrs_3_bin',
       'UniProt_len'],
      dtype='object')

In [41]:
df_analysis_updrs_3.dtypes

visit_id                                object
visit_month                              int64
updrs_3                                float64
upd23b_clinical_state_on_medication     object
UniProt                                 object
updrs_3_bin                             object
UniProt_len                              int64
dtype: object

In [42]:
df_columns_scaled = StandardScaler().fit_transform(df_analysis_updrs_3[['visit_month', 'updrs_3', 'UniProt_len']])

In [43]:
 # Create a DataFrame called with the scaled data
# The column names should match those referenced in the StandardScaler step
df_scaled = pd.DataFrame(
    df_columns_scaled,
    columns=['visit_month', 'updrs_3', 'UniProt_len']
)

# Create a Ticker column in the df_stocks_scaled DataFrame
# using the index of the original df_stocks DataFrame
df_scaled["visit_id"] = df_analysis_updrs_3["visit_id"]

# Set the newly created Ticker column as index of the df_stocks_scaled DataFrame
df_scaled = df_scaled.set_index("visit_id")

# Review the DataFrame
df_scaled.head()

Unnamed: 0_level_0,visit_month,updrs_3,UniProt_len
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
55_12,-1.093514,1.110269,1.053902
55_36,0.039834,1.866218,1.1906
1517_0,-1.660188,-0.099251,0.712159
1517_24,-0.52684,0.127534,0.917205
1517_36,0.039834,0.354319,0.917205


In [44]:
#### Get dummies

In [45]:
df_analysis_updrs_3_scaled = df_analysis_updrs_3.set_index("visit_id")
df_analysis_updrs_3_scaled

Unnamed: 0_level_0,visit_month,updrs_3,upd23b_clinical_state_on_medication,UniProt,updrs_3_bin,UniProt_len
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
55_12,12,41.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,222
55_36,36,51.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",severe,224
1517_0,0,25.0,On,"[O00391, O00533, O00584, O14773, O14791, O1524...",mild,217
1517_24,24,28.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
1517_36,36,31.0,On,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,220
...,...,...,...,...,...,...
64669_60,60,38.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,201
64674_48,48,46.0,On,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,210
64674_84,84,45.0,Off,"[O00533, O00584, O14498, O14773, O14791, O1524...",severe,207
65043_12,12,14.0,Off,"[O00391, O00533, O00584, O14498, O14773, O1479...",mild,218


In [46]:
 # Encode (convert to dummy variables) the EnergyType column
df_analysis_updrs_3_dummies_1 = pd.get_dummies(df_analysis_updrs_3_scaled["updrs_3_bin"])

# Review the DataFrame
df_analysis_updrs_3_dummies_1.head()

Unnamed: 0_level_0,mild,severe
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
55_12,0,1
55_36,0,1
1517_0,1,0
1517_24,1,0
1517_36,1,0


In [47]:
 # Encode (convert to dummy variables) the EnergyType column
df_analysis_updrs_3_dummies_2 = pd.get_dummies(df_analysis_updrs_3_scaled["upd23b_clinical_state_on_medication"])

# Review the DataFrame
df_analysis_updrs_3_dummies_2.head()

Unnamed: 0_level_0,Off,On
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
55_12,0,1
55_36,0,1
1517_0,0,1
1517_24,0,1
1517_36,0,1


In [48]:
### extra work

In [49]:
# df_analysis_updrs_3_scaled = df_scaled.set_index("visit_id")

In [50]:
# Concatenate the `EnergyType` encoded dummies with the scaled data DataFrame
df = pd.concat([df_scaled, df_analysis_updrs_3_dummies_2], axis=1)

# Display the sample data
df.head()

Unnamed: 0_level_0,visit_month,updrs_3,UniProt_len,Off,On
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
55_12,-1.093514,1.110269,1.053902,0,1
55_36,0.039834,1.866218,1.1906,0,1
1517_0,-1.660188,-0.099251,0.712159,0,1
1517_24,-0.52684,0.127534,0.917205,0,1
1517_36,0.039834,0.354319,0.917205,0,1


In [51]:
# # Concatenate the `EnergyType` encoded dummies with the scaled data DataFrame
# df = pd.concat([df, df_analysis_updrs_3_dummies_1], axis=1)

# # Display the sample data
# df.head()

In [52]:
### Analysis

In [53]:
 # Initialize the K-Means model with n_clusters=2
model = KMeans(n_clusters=2)

In [54]:
 # Fit the model for the df_stocks_scaled DataFrame
model.fit(df)



In [55]:
 # Predict the model segments (clusters)
df_clusters = model.predict(df)

# View the stock segments
print(df_clusters)

[1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0
 0 0 0 1 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 0 0 0
 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 1 1 0 1 1 0 1
 0 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0
 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1
 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 0 0 1 0
 0 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0]


In [56]:
 # Create a copy of the concatenated DataFrame
df_scaled_predictions = df.copy()

In [57]:
 # Create a new column in the copy of the concatenated DataFrame with the predicted clusters
df_scaled_predictions["Cluster_severe"] = df_clusters

# Review the DataFrame
df_scaled_predictions.head()

Unnamed: 0_level_0,visit_month,updrs_3,UniProt_len,Off,On,Cluster_severe
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
55_12,-1.093514,1.110269,1.053902,0,1,1
55_36,0.039834,1.866218,1.1906,0,1,1
1517_0,-1.660188,-0.099251,0.712159,0,1,0
1517_24,-0.52684,0.127534,0.917205,0,1,0
1517_36,0.039834,0.354319,0.917205,0,1,1


In [None]:
### Analyzing model

In [None]:
from sklearn.metrics import classification_report