In [1]:
import os
import pandas as pd
import numpy as np


# Get current working directory
current_dir = os.getcwd()

# Navigate to data folder
data_dir = os.path.join(current_dir, "data")

# Read CSV file
df = pd.read_csv(os.path.join(data_dir, "faers_asthma.csv"), sep=",")


In [2]:
print(df.head)

<bound method NDFrame.head of        Case ID Suspect Product Names   Suspect Product Active Ingredients  \
0      4974950           A-Methapred  Methylprednisolone Sodium Succinate   
1      4974946           A-Methapred  Methylprednisolone Sodium Succinate   
2      4652544           A-Methapred  Methylprednisolone Sodium Succinate   
3      4656790           A-Methapred  Methylprednisolone Sodium Succinate   
4      4610124           A-Methapred  Methylprednisolone Sodium Succinate   
...        ...                   ...                                  ...   
92319  4230505                     -                       Nitrofurantoin   
92320  4236734                     -                           Gentamicin   
92321  4235917                     -                              Aspirin   
92322  4234961                     -               Gold Sodium Thiomalate   
92323  4227900                     -                              Aspirin   

      Reason for Use                         

In [3]:
#Create Patient Age column to be only numeric and replace missings with the mean
df['Patient Age'] = df['Patient Age'].str.extract('(\d+)').astype(float)
mean_age = df['Patient Age'].mean()
df['Patient Age'].fillna(mean_age, inplace=True)


In [4]:
#Create Patient Weight column to be only numeric and replace missings with the meandf['Patient Weight'] = df['Patient Weight'].str.extract('(\d+)').astype(float)
df['Patient Weight'] = df['Patient Weight'].str.extract('(\d+)').astype(float)
mean_weight = df['Patient Weight'].mean()
df['Patient Weight'].fillna(mean_weight, inplace=True)

In [5]:
print(df.columns)

Index(['Case ID', 'Suspect Product Names',
       'Suspect Product Active Ingredients', 'Reason for Use', 'Reactions',
       'Serious', 'Outcomes', 'Sex', 'Event Date', 'Latest FDA Received Date',
       'Case Priority', 'Patient Age', 'Patient Weight', 'Sender',
       'Reporter Type', 'Report Source', 'Concomitant Product Names',
       'Latest Manufacturer Received Date', 'Initial FDA Received Date',
       'Country where Event occurred', 'Reported to Manufacturer?',
       'Manufacturer Control Number', 'Literature Reference',
       'Compounded Flag'],
      dtype='object')


In [6]:

# Replace values with 1 and 0 for 'Serious' variable
df['Serious'] = df['Serious'].replace({'Serious': 1, 'Non-Serious': 0})

# Convert variable to categorical variable
df['Serious'] = df['Serious'].astype('category')

df['Serious']

0        0
1        1
2        1
3        0
4        1
        ..
92319    1
92320    1
92321    1
92322    1
92323    1
Name: Serious, Length: 92324, dtype: category
Categories (2, int64): [0, 1]

In [7]:

# Replace values with 1 and 0 for 'Case Priority' variable
df['Case Priority'] = df['Serious'].replace({'Expedited': 1, 'Non-Expedited': 0, 'Direct': 2, '30-DAY': 3, '5-Day': 4, 'BSR': 5})

# Convert variable to categorical variable
df['Case Priority'] = df['Case Priority'].astype('category')

df['Case Priority']

0        0
1        1
2        1
3        0
4        1
        ..
92319    1
92320    1
92321    1
92322    1
92323    1
Name: Case Priority, Length: 92324, dtype: category
Categories (2, int64): [0, 1]

In [8]:

# Replace values with 1 and 0 for 'Sex' variable
df['Sex'] = df['Sex'].replace({'Male': 1, 'Female': 2, 'Direct': 2, 'Not Specified': 0})

# Convert variable to categorical variable
df['Sex'] = df['Case Priority'].astype('category')

df['Sex']

0        0
1        1
2        1
3        0
4        1
        ..
92319    1
92320    1
92321    1
92322    1
92323    1
Name: Sex, Length: 92324, dtype: category
Categories (2, int64): [0, 1]

In [9]:

# Replace values with 1 and 0 for 'Reported to Manufacturer?' variable
df['Reported to Manufacturer?'] = df['Reported to Manufacturer?'].replace({'Y': 1, 'Not Specified': 0})

# Convert variable to categorical variable
df['Reported to Manufacturer?'] = df['Reported to Manufacturer?'].astype('category')

df['Reported to Manufacturer?']

0        0
1        0
2        0
3        0
4        0
        ..
92319    0
92320    0
92321    0
92322    0
92323    0
Name: Reported to Manufacturer?, Length: 92324, dtype: category
Categories (2, int64): [0, 1]

In [10]:
# Replace values with 1,2, and 0 for 'Reporter Type' variable
df['Reporter Type'] = df['Reporter Type'].replace({'Consumer': 1, 'Healthcare Professional': 2, 'Not Specified': 0})

# Convert variable to categorical variable
df['Reporter Type'] = df['Reporter Type'].astype('category')

df['Reporter Type']

0        1
1        1
2        1
3        1
4        2
        ..
92319    2
92320    2
92321    2
92322    2
92323    2
Name: Reporter Type, Length: 92324, dtype: category
Categories (3, int64): [0, 1, 2]

In [11]:
cleaned_data = df[['Suspect Product Names', 'Reason for Use', 'Suspect Product Active Ingredients', 'Reactions', 'Serious', 'Sex',
                   'Patient Age', 'Patient Weight','Reporter Type','Report Source', 'Concomitant Product Names','Country where Event occurred', 'Outcomes','Reported to Manufacturer?',]]
print(cleaned_data)


      Suspect Product Names Reason for Use  \
0               A-Methapred              -   
1               A-Methapred              -   
2               A-Methapred              -   
3               A-Methapred              -   
4               A-Methapred              -   
...                     ...            ...   
92319                     -              -   
92320                     -              -   
92321                     -              -   
92322                     -              -   
92323                     -              -   

        Suspect Product Active Ingredients  \
0      Methylprednisolone Sodium Succinate   
1      Methylprednisolone Sodium Succinate   
2      Methylprednisolone Sodium Succinate   
3      Methylprednisolone Sodium Succinate   
4      Methylprednisolone Sodium Succinate   
...                                    ...   
92319                       Nitrofurantoin   
92320                           Gentamicin   
92321                            

In [12]:
#cleaned_data.to_csv('cleaned_data.csv', index=False)

In [13]:

# select the columns of interest
cols_of_interest = ['Suspect Product Names', 'Reason for Use', 'Suspect Product Active Ingredients',
                    'Reactions', 'Serious', 'Sex', 'Patient Age', 'Patient Weight',
                    'Reporter Type', 'Report Source', 'Concomitant Product Names',
                    'Country where Event occurred', 'Outcomes', 'Reported to Manufacturer?']

# get basic descriptive statistics for the numeric columns
numeric_cols = ['Patient Age', 'Patient Weight']
print(cleaned_data[numeric_cols].describe())


# get basic descriptive statistics for the categorical columns
categorical_cols = list(set(cols_of_interest) - set(numeric_cols))
for col in categorical_cols:
    print(cleaned_data[col].value_counts())


        Patient Age  Patient Weight
count  92324.000000    92324.000000
mean     271.105004       94.413211
std     1800.906524       24.281070
min        0.000000        0.000000
25%       48.000000       94.413211
50%       66.000000       94.413211
75%      271.105004       94.413211
max    35248.000000      559.000000
0    92022
1      302
Name: Reported to Manufacturer?, dtype: int64
1    47467
2    41687
0     3170
Name: Reporter Type, dtype: int64
-                                                                                                                                                                                                                                            18318
Asthma                                                                                                                                                                                                                                       16960
Product Used For Unknown Indication                    

In [14]:
print(cleaned_data)

      Suspect Product Names Reason for Use  \
0               A-Methapred              -   
1               A-Methapred              -   
2               A-Methapred              -   
3               A-Methapred              -   
4               A-Methapred              -   
...                     ...            ...   
92319                     -              -   
92320                     -              -   
92321                     -              -   
92322                     -              -   
92323                     -              -   

        Suspect Product Active Ingredients  \
0      Methylprednisolone Sodium Succinate   
1      Methylprednisolone Sodium Succinate   
2      Methylprednisolone Sodium Succinate   
3      Methylprednisolone Sodium Succinate   
4      Methylprednisolone Sodium Succinate   
...                                    ...   
92319                       Nitrofurantoin   
92320                           Gentamicin   
92321                            