In [1]:
import os
import pandas as pd
import numpy as np


# Get current working directory
current_dir = os.getcwd()

# Navigate to data folder
data_dir = os.path.join(current_dir, "data")

# Read CSV file
df = pd.read_csv(os.path.join(data_dir, "faers_asthma.csv"), sep=",")


In [2]:
print(df.head)

<bound method NDFrame.head of        Case ID Suspect Product Names   Suspect Product Active Ingredients  \
0      4974950           A-Methapred  Methylprednisolone Sodium Succinate   
1      4974946           A-Methapred  Methylprednisolone Sodium Succinate   
2      4652544           A-Methapred  Methylprednisolone Sodium Succinate   
3      4656790           A-Methapred  Methylprednisolone Sodium Succinate   
4      4610124           A-Methapred  Methylprednisolone Sodium Succinate   
...        ...                   ...                                  ...   
92319  4230505                     -                       Nitrofurantoin   
92320  4236734                     -                           Gentamicin   
92321  4235917                     -                              Aspirin   
92322  4234961                     -               Gold Sodium Thiomalate   
92323  4227900                     -                              Aspirin   

      Reason for Use                         

In [3]:
#Create Patient Age column to be only numeric and replace missings with the mean
df['Patient Age'] = df['Patient Age'].str.extract('(\d+)').astype(float)



In [4]:
#Create Patient Weight column to be only numeric and replace missings with the meandf['Patient Weight'] = df['Patient Weight'].str.extract('(\d+)').astype(float)
df['Patient Weight'] = df['Patient Weight'].str.extract('(\d+)').astype(float)


In [5]:
# calculate the IQR for Patient Age and Patient Weight
q1_age = df['Patient Age'].quantile(0.25)
q3_age = df['Patient Age'].quantile(0.75)
iqr_age = q3_age - q1_age

q1_weight = df['Patient Weight'].quantile(0.25)
q3_weight = df['Patient Weight'].quantile(0.75)
iqr_weight = q3_weight - q1_weight

# remove any data points outside of 1.5 times the IQR
df = df[(df['Patient Age'] >= q1_age - 1.5*iqr_age) & (df['Patient Age'] <= q3_age + 1.5*iqr_age)]
df = df[(df['Patient Weight'] >= q1_weight - 1.5*iqr_weight) & (df['Patient Weight'] <= q3_weight + 1.5*iqr_weight)]


In [6]:
mean_age = df['Patient Age'].mean()
df['Patient Age'].fillna(mean_age, inplace=True)

In [7]:
mean_weight = df['Patient Weight'].mean()
df['Patient Weight'].fillna(mean_weight, inplace=True)

In [8]:
print(df.columns)

Index(['Case ID', 'Suspect Product Names',
       'Suspect Product Active Ingredients', 'Reason for Use', 'Reactions',
       'Serious', 'Outcomes', 'Sex', 'Event Date', 'Latest FDA Received Date',
       'Case Priority', 'Patient Age', 'Patient Weight', 'Sender',
       'Reporter Type', 'Report Source', 'Concomitant Product Names',
       'Latest Manufacturer Received Date', 'Initial FDA Received Date',
       'Country where Event occurred', 'Reported to Manufacturer?',
       'Manufacturer Control Number', 'Literature Reference',
       'Compounded Flag'],
      dtype='object')


In [9]:

# Replace values with 1 and 0 for 'Serious' variable
df['Serious'] = df['Serious'].replace({'Serious': 1, 'Non-Serious': 0})

# Convert variable to categorical variable
df['Serious'] = df['Serious'].astype('category')

df['Serious']

6        1
31       1
33       1
35       1
36       1
        ..
90968    1
90970    1
90974    1
90988    1
91045    1
Name: Serious, Length: 15655, dtype: category
Categories (2, int64): [0, 1]

In [10]:

# Replace values with 1 and 0 for 'Case Priority' variable
df['Case Priority'] = df['Serious'].replace({'Expedited': 1, 'Non-Expedited': 0, 'Direct': 2, '30-DAY': 3, '5-Day': 4, 'BSR': 5})

# Convert variable to categorical variable
df['Case Priority'] = df['Case Priority'].astype('category')

df['Case Priority']

6        1
31       1
33       1
35       1
36       1
        ..
90968    1
90970    1
90974    1
90988    1
91045    1
Name: Case Priority, Length: 15655, dtype: category
Categories (2, int64): [0, 1]

In [11]:

# Replace values with 1 and 0 for 'Sex' variable
df['Sex'] = df['Sex'].replace({'Male': 1, 'Female': 2, 'Direct': 2, 'Not Specified': 0})

# Convert variable to categorical variable
df['Sex'] = df['Case Priority'].astype('category')

df['Sex']

6        1
31       1
33       1
35       1
36       1
        ..
90968    1
90970    1
90974    1
90988    1
91045    1
Name: Sex, Length: 15655, dtype: category
Categories (2, int64): [0, 1]

In [12]:

# Replace values with 1 and 0 for 'Reported to Manufacturer?' variable
df['Reported to Manufacturer?'] = df['Reported to Manufacturer?'].replace({'Y': 1, 'Not Specified': 0})

# Convert variable to categorical variable
df['Reported to Manufacturer?'] = df['Reported to Manufacturer?'].astype('category')

df['Reported to Manufacturer?']

6        0
31       0
33       0
35       0
36       0
        ..
90968    0
90970    0
90974    0
90988    0
91045    0
Name: Reported to Manufacturer?, Length: 15655, dtype: category
Categories (2, int64): [0, 1]

In [13]:
# Replace values with 1,2, and 0 for 'Reporter Type' variable
df['Reporter Type'] = df['Reporter Type'].replace({'Consumer': 1, 'Healthcare Professional': 2, 'Not Specified': 0})

# Convert variable to categorical variable
df['Reporter Type'] = df['Reporter Type'].astype('category')

df['Reporter Type']

6        0
31       2
33       1
35       2
36       1
        ..
90968    2
90970    2
90974    1
90988    2
91045    2
Name: Reporter Type, Length: 15655, dtype: category
Categories (3, int64): [0, 1, 2]

In [14]:
cleaned_data = df[['Suspect Product Names', 'Reason for Use', 'Suspect Product Active Ingredients', 'Reactions', 'Serious', 'Sex',
                   'Patient Age', 'Patient Weight','Reporter Type','Report Source', 'Concomitant Product Names','Country where Event occurred', 'Outcomes','Reported to Manufacturer?',]]
print(cleaned_data)


      Suspect Product Names  \
6                    Aarane   
31               Abbokinase   
33               Abbokinase   
35               Abbokinase   
36               Abbokinase   
...                     ...   
90968                     -   
90970                     -   
90974                     -   
90988                     -   
91045                     -   

                                          Reason for Use  \
6      Asthma;Exposure To Allergen;Prophylaxis;Rhinit...   
31                                                     -   
33                                                     -   
35                                                     -   
36                                                     -   
...                                                  ...   
90968                                                  -   
90970                                                  -   
90974                                                  -   
90988                      

In [15]:
#cleaned_data.to_csv('cleaned_data.csv', index=False)

In [16]:

# select the columns of interest
cols_of_interest = ['Suspect Product Names', 'Reason for Use', 'Suspect Product Active Ingredients',
                    'Reactions', 'Serious', 'Sex', 'Patient Age', 'Patient Weight',
                    'Reporter Type', 'Report Source', 'Concomitant Product Names',
                    'Country where Event occurred', 'Outcomes', 'Reported to Manufacturer?']

# get basic descriptive statistics for the numeric columns
numeric_cols = ['Patient Age', 'Patient Weight']
print(cleaned_data[numeric_cols].describe())


# get basic descriptive statistics for the categorical columns
categorical_cols = list(set(cols_of_interest) - set(numeric_cols))
for col in categorical_cols:
    print(cleaned_data[col].value_counts())


        Patient Age  Patient Weight
count  15655.000000    15655.000000
mean      50.198722       85.915490
std       20.423576       35.978621
min        0.000000        0.000000
25%       38.000000       62.000000
50%       53.000000       80.000000
75%       65.000000      105.000000
max      101.000000      181.000000
Asthma                                                                                                                1240
Dyspnoea;Asthma                                                                                                         77
Condition Aggravated;Asthma                                                                                             66
Asthma;Dyspnoea                                                                                                         66
Drug Ineffective;Asthma                                                                                                 47
                                                             

In [17]:
print(cleaned_data)

      Suspect Product Names  \
6                    Aarane   
31               Abbokinase   
33               Abbokinase   
35               Abbokinase   
36               Abbokinase   
...                     ...   
90968                     -   
90970                     -   
90974                     -   
90988                     -   
91045                     -   

                                          Reason for Use  \
6      Asthma;Exposure To Allergen;Prophylaxis;Rhinit...   
31                                                     -   
33                                                     -   
35                                                     -   
36                                                     -   
...                                                  ...   
90968                                                  -   
90970                                                  -   
90974                                                  -   
90988                      