# Autism Prediction in Adults

Data Preprocessing

In [1]:
# For adult dataset
import arff
import pandas as pd
# Load arff file
with open('Autism-Adult-Data.arff') as f:
    dataset = arff.load(f)
# Convert to DataFrame
adult = pd.DataFrame(dataset['data'], columns=[attr[0] for attr in dataset['attributes']])
# Display 
adult

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5.0,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8.0,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6.0,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,,no,no,Egypt,no,2.0,18 and more,,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0,1,0,1,1,0,1,1,1,1,...,f,White-European,no,no,Russia,no,7.0,18 and more,Self,YES
700,1,0,0,0,0,0,0,1,0,1,...,m,Hispanic,no,no,Mexico,no,3.0,18 and more,Parent,NO
701,1,0,1,1,1,0,1,1,0,1,...,f,,no,no,Russia,no,7.0,18 and more,,YES
702,1,0,0,1,1,0,1,0,1,1,...,m,South Asian,no,no,Pakistan,no,6.0,18 and more,Self,NO


In [2]:
# To display all the columns present in the dataset
pd.set_option('display.max_columns',None)

In [3]:
# To know the number of rows and columns in the dataset
adult.shape

(704, 21)

In [4]:
# The dataset contains values like ? so that I replaced those ? with nan values by using numpy library
import numpy as np
adult = adult.replace('?', np.nan)

In [5]:
# To know how many null values present in the column and and the datatype of each column
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         704 non-null    object 
 1   A2_Score         704 non-null    object 
 2   A3_Score         704 non-null    object 
 3   A4_Score         704 non-null    object 
 4   A5_Score         704 non-null    object 
 5   A6_Score         704 non-null    object 
 6   A7_Score         704 non-null    object 
 7   A8_Score         704 non-null    object 
 8   A9_Score         704 non-null    object 
 9   A10_Score        704 non-null    object 
 10  age              702 non-null    float64
 11  gender           704 non-null    object 
 12  ethnicity        609 non-null    object 
 13  jundice          704 non-null    object 
 14  austim           704 non-null    object 
 15  contry_of_res    704 non-null    object 
 16  used_app_before  704 non-null    object 
 17  result          

In [6]:
# To know what type of data is present in each column
for col in adult.columns:
    print(col,adult[col].unique())
    print("-"*50)

A1_Score ['1' '0']
--------------------------------------------------
A2_Score ['1' '0']
--------------------------------------------------
A3_Score ['1' '0']
--------------------------------------------------
A4_Score ['1' '0']
--------------------------------------------------
A5_Score ['0' '1']
--------------------------------------------------
A6_Score ['0' '1']
--------------------------------------------------
A7_Score ['1' '0']
--------------------------------------------------
A8_Score ['1' '0']
--------------------------------------------------
A9_Score ['0' '1']
--------------------------------------------------
A10_Score ['0' '1']
--------------------------------------------------
age [ 26.  24.  27.  35.  40.  36.  17.  64.  29.  33.  18.  31.  30.  34.
  38.  42.  43.  48.  37.  55.  50.  53.  20.  28.  21. 383.  47.  32.
  44.  nan  19.  58.  45.  22.  39.  25.  23.  54.  60.  41.  46.  56.
  61.  59.  52.  49.  51.]
--------------------------------------------------
gend

In [7]:
# The screening test scores contains data in string format. For training the model i have changed it from string to integer usng astype
adult['A1_Score']=adult['A1_Score'].astype(int)
adult['A2_Score']=adult['A2_Score'].astype(int)
adult['A3_Score']=adult['A3_Score'].astype(int)
adult['A4_Score']=adult['A4_Score'].astype(int)
adult['A5_Score']=adult['A5_Score'].astype(int)
adult['A6_Score']=adult['A6_Score'].astype(int)
adult['A7_Score']=adult['A7_Score'].astype(int)
adult['A8_Score']=adult['A8_Score'].astype(int)
adult['A9_Score']=adult['A9_Score'].astype(int)
adult['A10_Score']=adult['A10_Score'].astype(int)

In [8]:
# To check weather the datatype is changed or not
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         704 non-null    int32  
 1   A2_Score         704 non-null    int32  
 2   A3_Score         704 non-null    int32  
 3   A4_Score         704 non-null    int32  
 4   A5_Score         704 non-null    int32  
 5   A6_Score         704 non-null    int32  
 6   A7_Score         704 non-null    int32  
 7   A8_Score         704 non-null    int32  
 8   A9_Score         704 non-null    int32  
 9   A10_Score        704 non-null    int32  
 10  age              702 non-null    float64
 11  gender           704 non-null    object 
 12  ethnicity        609 non-null    object 
 13  jundice          704 non-null    object 
 14  austim           704 non-null    object 
 15  contry_of_res    704 non-null    object 
 16  used_app_before  704 non-null    object 
 17  result          

In [9]:
# age column contains null values and i am filling those values with median of age
median_age = adult['age'].median()
adult['age'] = adult['age'].fillna(median_age)

In [10]:
# To check the null values are replaced and are they converted to integer or not
adult['age'].unique()

array([ 26.,  24.,  27.,  35.,  40.,  36.,  17.,  64.,  29.,  33.,  18.,
        31.,  30.,  34.,  38.,  42.,  43.,  48.,  37.,  55.,  50.,  53.,
        20.,  28.,  21., 383.,  47.,  32.,  44.,  19.,  58.,  45.,  22.,
        39.,  25.,  23.,  54.,  60.,  41.,  46.,  56.,  61.,  59.,  52.,
        49.,  51.])

In [11]:
# now i am converting all the values in age column from float to int
adult['age']=adult['age'].astype(int)

In [12]:
# To check weathter the outliers are replaced or not
adult['age'].unique()

array([ 26,  24,  27,  35,  40,  36,  17,  64,  29,  33,  18,  31,  30,
        34,  38,  42,  43,  48,  37,  55,  50,  53,  20,  28,  21, 383,
        47,  32,  44,  19,  58,  45,  22,  39,  25,  23,  54,  60,  41,
        46,  56,  61,  59,  52,  49,  51])

In [13]:
# To check weather the age column is cleaned or not
adult['age'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 704 entries, 0 to 703
Series name: age
Non-Null Count  Dtype
--------------  -----
704 non-null    int32
dtypes: int32(1)
memory usage: 2.9 KB


In [14]:
# in the ethnicity column it contains null values and i replaced those null values with others and i replaced 'Others' with 'others'
adult['ethnicity'] = adult['ethnicity'].replace({None:"others","Others":"others"})

In [15]:
adult['ethnicity'].unique()

array(['White-European', 'Latino', 'others', 'Black', 'Asian',
       'Middle Eastern ', 'Pasifika', 'South Asian', 'Hispanic',
       'Turkish'], dtype=object)

In [16]:
# Converting all the letters in the ethnicity column to lowercase
adult['ethnicity'] = adult['ethnicity'].str.strip().str.lower() 

In [17]:
# There is a space between words in ethnicity column it may cause problems while training the model. so that I am replacing that space with _
adult['ethnicity'] = adult['ethnicity'].str.replace(" ","_")

In [18]:
# To check all the values has been changed
adult['ethnicity'].unique()

array(['white-european', 'latino', 'others', 'black', 'asian',
       'middle_eastern', 'pasifika', 'south_asian', 'hispanic', 'turkish'],
      dtype=object)

In [19]:
# To check the null values are replaced or not
adult['ethnicity'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 704 entries, 0 to 703
Series name: ethnicity
Non-Null Count  Dtype 
--------------  ----- 
704 non-null    object
dtypes: object(1)
memory usage: 5.6+ KB


In [20]:
# There is a spelling mistake in the column name
adult.rename(columns={'jundice': 'jaundice'}, inplace=True)
adult.rename(columns={'austim': 'autism'}, inplace=True)

In [21]:
# There is a spelling mistake in the column name
adult.rename(columns={'contry_of_res': 'country_of_res'}, inplace=True)

In [22]:
# Converting all the letter to lower case in country_of_res column
adult['country_of_res'] = adult['country_of_res'].str.strip().str.lower() 

In [23]:
# To check the column name is changed in the dataframe or not
adult.head(2)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,26,f,white-european,no,no,united states,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,24,m,latino,no,yes,brazil,no,5.0,18 and more,Self,NO


In [24]:
adult['country_of_res']=adult['country_of_res'].replace({'viet nam':'vietnam'})

In [25]:
# There is a space between words in country_of_res column it may cause problems while training the model. so that I am replacing that space with _
adult['country_of_res'] = adult['country_of_res'].str.replace(" ","_")

In [26]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         704 non-null    int32  
 1   A2_Score         704 non-null    int32  
 2   A3_Score         704 non-null    int32  
 3   A4_Score         704 non-null    int32  
 4   A5_Score         704 non-null    int32  
 5   A6_Score         704 non-null    int32  
 6   A7_Score         704 non-null    int32  
 7   A8_Score         704 non-null    int32  
 8   A9_Score         704 non-null    int32  
 9   A10_Score        704 non-null    int32  
 10  age              704 non-null    int32  
 11  gender           704 non-null    object 
 12  ethnicity        704 non-null    object 
 13  jaundice         704 non-null    object 
 14  autism           704 non-null    object 
 15  country_of_res   704 non-null    object 
 16  used_app_before  704 non-null    object 
 17  result          

In [27]:
for col in adult.columns:
    print(col,adult[col].unique())
    print("-"*50)

A1_Score [1 0]
--------------------------------------------------
A2_Score [1 0]
--------------------------------------------------
A3_Score [1 0]
--------------------------------------------------
A4_Score [1 0]
--------------------------------------------------
A5_Score [0 1]
--------------------------------------------------
A6_Score [0 1]
--------------------------------------------------
A7_Score [1 0]
--------------------------------------------------
A8_Score [1 0]
--------------------------------------------------
A9_Score [0 1]
--------------------------------------------------
A10_Score [0 1]
--------------------------------------------------
age [ 26  24  27  35  40  36  17  64  29  33  18  31  30  34  38  42  43  48
  37  55  50  53  20  28  21 383  47  32  44  19  58  45  22  39  25  23
  54  60  41  46  56  61  59  52  49  51]
--------------------------------------------------
gender ['f' 'm']
--------------------------------------------------
ethnicity ['white-european' 

In [28]:
# The result column is the sum of all the screening tests and the data is in float datatype so that i am converting it to integer
adult['result']=adult['result'].astype(int)

In [29]:
# To check weather the datatype is changed or not
adult['result'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 704 entries, 0 to 703
Series name: result
Non-Null Count  Dtype
--------------  -----
704 non-null    int32
dtypes: int32(1)
memory usage: 2.9 KB


In [30]:
# The age_desc column contains only single value that is of no use for training the model so that I am removing that column
adult.drop('age_desc',axis=1,inplace=True)

In [31]:
# To check weather the column is deteted or not
adult.head(2)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,26,f,white-european,no,no,united_states,no,6,Self,NO
1,1,1,0,1,0,0,0,1,0,1,24,m,latino,no,yes,brazil,no,5,Self,NO


In [32]:
# To relation column contains none values and are replaced with 'others'
adult['relation'] = adult['relation'].replace({None:"Others"})

In [33]:
# Converting all the letters in the relation column into lower case
adult['relation'] = adult['relation'].str.strip().str.lower() 

In [34]:
adult['relation'] = adult['relation'].str.replace(" ","_")

In [35]:
adult['relation'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 704 entries, 0 to 703
Series name: relation
Non-Null Count  Dtype 
--------------  ----- 
704 non-null    object
dtypes: object(1)
memory usage: 5.6+ KB


In [36]:
# To check weather all the letters are in lowercase or not
adult['relation'].unique()

array(['self', 'parent', 'others', 'health_care_professional', 'relative'],
      dtype=object)

In [37]:
# To convert all the letters in the target variable to lower
adult['Class/ASD'] = adult['Class/ASD'].str.strip().str.lower() 

In [38]:
adult.rename(columns={'Class/ASD':'target'},inplace=True)

In [39]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A1_Score         704 non-null    int32 
 1   A2_Score         704 non-null    int32 
 2   A3_Score         704 non-null    int32 
 3   A4_Score         704 non-null    int32 
 4   A5_Score         704 non-null    int32 
 5   A6_Score         704 non-null    int32 
 6   A7_Score         704 non-null    int32 
 7   A8_Score         704 non-null    int32 
 8   A9_Score         704 non-null    int32 
 9   A10_Score        704 non-null    int32 
 10  age              704 non-null    int32 
 11  gender           704 non-null    object
 12  ethnicity        704 non-null    object
 13  jaundice         704 non-null    object
 14  autism           704 non-null    object
 15  country_of_res   704 non-null    object
 16  used_app_before  704 non-null    object
 17  result           704 non-null    in

In [40]:
for col in adult.columns:
    print(col,adult[col].unique())
    print("-"*50)

A1_Score [1 0]
--------------------------------------------------
A2_Score [1 0]
--------------------------------------------------
A3_Score [1 0]
--------------------------------------------------
A4_Score [1 0]
--------------------------------------------------
A5_Score [0 1]
--------------------------------------------------
A6_Score [0 1]
--------------------------------------------------
A7_Score [1 0]
--------------------------------------------------
A8_Score [1 0]
--------------------------------------------------
A9_Score [0 1]
--------------------------------------------------
A10_Score [0 1]
--------------------------------------------------
age [ 26  24  27  35  40  36  17  64  29  33  18  31  30  34  38  42  43  48
  37  55  50  53  20  28  21 383  47  32  44  19  58  45  22  39  25  23
  54  60  41  46  56  61  59  52  49  51]
--------------------------------------------------
gender ['f' 'm']
--------------------------------------------------
ethnicity ['white-european' 

In [41]:
adult.to_excel('Cleaned_dataset.xlsx',index=False)

In [42]:
adult1 = pd.read_excel('Cleaned_dataset.xlsx')
adult1

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,relation,target
0,1,1,1,1,0,0,1,1,0,0,26,f,white-european,no,no,united_states,no,6,self,no
1,1,1,0,1,0,0,0,1,0,1,24,m,latino,no,yes,brazil,no,5,self,no
2,1,1,0,1,1,0,1,1,1,1,27,m,latino,yes,yes,spain,no,8,parent,yes
3,1,1,0,1,0,0,1,1,0,1,35,f,white-european,no,yes,united_states,no,6,self,no
4,1,0,0,0,0,0,0,1,0,0,40,f,others,no,no,egypt,no,2,others,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0,1,0,1,1,0,1,1,1,1,25,f,white-european,no,no,russia,no,7,self,yes
700,1,0,0,0,0,0,0,1,0,1,34,m,hispanic,no,no,mexico,no,3,parent,no
701,1,0,1,1,1,0,1,1,0,1,24,f,others,no,no,russia,no,7,others,yes
702,1,0,0,1,1,0,1,0,1,1,35,m,south_asian,no,no,pakistan,no,6,self,no


In [43]:
adult1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A1_Score         704 non-null    int64 
 1   A2_Score         704 non-null    int64 
 2   A3_Score         704 non-null    int64 
 3   A4_Score         704 non-null    int64 
 4   A5_Score         704 non-null    int64 
 5   A6_Score         704 non-null    int64 
 6   A7_Score         704 non-null    int64 
 7   A8_Score         704 non-null    int64 
 8   A9_Score         704 non-null    int64 
 9   A10_Score        704 non-null    int64 
 10  age              704 non-null    int64 
 11  gender           704 non-null    object
 12  ethnicity        704 non-null    object
 13  jaundice         704 non-null    object
 14  autism           704 non-null    object
 15  country_of_res   704 non-null    object
 16  used_app_before  704 non-null    object
 17  result           704 non-null    in

In [44]:
# To find the columns which are required for label encoding
cols_to_encode = adult1.select_dtypes(include=object).columns
print(cols_to_encode)

Index(['gender', 'ethnicity', 'jaundice', 'autism', 'country_of_res',
       'used_app_before', 'relation', 'target'],
      dtype='object')


In [45]:
# Applying Label Encoding for the columns having datatype 'object' and printing what value is assigned to each one in a column
from sklearn.preprocessing import LabelEncoder
encoders = {}
# Apply label encoding and store encoders
for col in cols_to_encode:
    le = LabelEncoder()
    adult1[col] = le.fit_transform(adult1[col])
    encoders[col] = le

# Print mappings: string -> number
for col in cols_to_encode:
    print(f"\nLabel Encoding Mapping for '{col}':")
    mapping = dict(zip(encoders[col].classes_, encoders[col].transform(encoders[col].classes_)))
    for key, value in mapping.items():
        print(f"{key} --> {value}")


Label Encoding Mapping for 'gender':
f --> 0
m --> 1

Label Encoding Mapping for 'ethnicity':
asian --> 0
black --> 1
hispanic --> 2
latino --> 3
middle_eastern --> 4
others --> 5
pasifika --> 6
south_asian --> 7
turkish --> 8
white-european --> 9

Label Encoding Mapping for 'jaundice':
no --> 0
yes --> 1

Label Encoding Mapping for 'autism':
no --> 0
yes --> 1

Label Encoding Mapping for 'country_of_res':
afghanistan --> 0
americansamoa --> 1
angola --> 2
argentina --> 3
armenia --> 4
aruba --> 5
australia --> 6
austria --> 7
azerbaijan --> 8
bahamas --> 9
bangladesh --> 10
belgium --> 11
bolivia --> 12
brazil --> 13
burundi --> 14
canada --> 15
chile --> 16
china --> 17
costa_rica --> 18
cyprus --> 19
czech_republic --> 20
ecuador --> 21
egypt --> 22
ethiopia --> 23
finland --> 24
france --> 25
germany --> 26
hong_kong --> 27
iceland --> 28
india --> 29
indonesia --> 30
iran --> 31
iraq --> 32
ireland --> 33
italy --> 34
japan --> 35
jordan --> 36
kazakhstan --> 37
lebanon --> 38
ma

In [46]:
adult1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   A1_Score         704 non-null    int64
 1   A2_Score         704 non-null    int64
 2   A3_Score         704 non-null    int64
 3   A4_Score         704 non-null    int64
 4   A5_Score         704 non-null    int64
 5   A6_Score         704 non-null    int64
 6   A7_Score         704 non-null    int64
 7   A8_Score         704 non-null    int64
 8   A9_Score         704 non-null    int64
 9   A10_Score        704 non-null    int64
 10  age              704 non-null    int64
 11  gender           704 non-null    int32
 12  ethnicity        704 non-null    int32
 13  jaundice         704 non-null    int32
 14  autism           704 non-null    int32
 15  country_of_res   704 non-null    int32
 16  used_app_before  704 non-null    int32
 17  result           704 non-null    int64
 18  relation  

In [47]:
adult1.to_excel('final_preprocessed_dataset.xlsx',index=False)