In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Seed NumPy's global random generator so that any later np.random call is repeatable.
np.random.seed(1)

In [3]:
# Load the raw dataset (path is relative to the working directory) and preview it.
df = pd.read_csv("language.csv")
df.head()

Unnamed: 0,Y,filename,sex,age,age_years,corpus,group,child_TNW,child_TNS,examiner_TNW,...,word_errors,f_k,n_v,n_aux,n_3s_v,det_n_pl,det_pl_n,pro_aux,pro_3s_v,total_error
0,1,fssli009.cha,,165,13.75,Conti4,SLI,287,36,4,...,8,1.210456,0,2,2,7,0,0,1,12
1,1,fssli058.cha,,172,14.333333,Conti4,SLI,368,42,27,...,16,1.871708,0,4,0,5,0,0,0,9
2,1,fssli062.cha,,160,13.333333,Conti4,SLI,266,26,2,...,0,2.240602,0,1,0,5,0,0,0,6
3,1,fssli066.cha,,184,15.333333,Conti4,SLI,405,40,21,...,4,1.877762,1,0,0,11,0,0,0,12
4,1,fssli108.cha,,176,14.666667,Conti4,SLI,300,35,20,...,8,0.339524,0,1,1,5,0,0,0,7


In [4]:
# Column dtypes and non-null counts; a count below the row total reveals missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 64 columns):
Y                        1163 non-null int64
filename                 1163 non-null object
sex                      1044 non-null object
age                      1163 non-null int64
age_years                1163 non-null float64
corpus                   1163 non-null object
group                    1163 non-null object
child_TNW                1163 non-null int64
child_TNS                1163 non-null int64
examiner_TNW             1163 non-null int64
freq_ttr                 1163 non-null float64
r_2_i_verbs              1163 non-null float64
mor_words                1163 non-null int64
num_pos_tags             1163 non-null int64
n_dos                    1163 non-null int64
repetition               1163 non-null int64
retracing                1163 non-null int64
fillers                  1163 non-null int64
s_1g_ppl                 1163 non-null float64
s_2g_ppl                 1

## Data Cleaning

In [5]:
# Name of the label column; it is split from the feature columns further down.
target = 'Y'

### Replace empty or erroneous entries with np.nan

In [6]:
# Replace empty/erroneous entries by np.nan.
# All patterns are raw strings: '\?' and '\<' inside a normal string literal
# are invalid escape sequences, which recent Python versions warn about.

# Empty or whitespace-only cells count as missing.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Strip stray tabs and spaces from the remaining text cells.
df = df.replace(r'[\t ]', '', regex=True)
# Text cells containing one of these markers become missing.
df = df.replace(r'\?', np.nan, regex=True)
df = df.replace(r'<', np.nan, regex=True)
df = df.replace(r'#NULL!', np.nan, regex=True)
# NOTE: the value 99 is not treated as a missing-value code here.

### Remove non-feature (metadata) columns

In [7]:
# These columns are dropped before modelling; the list keeps its
# historical name `outliers` although it holds column names.
# NOTE(review): `group` looks like it may encode the label itself - confirm.
outliers = ['filename','corpus','group']
df = df.drop(columns=outliers)
df.head()

Unnamed: 0,Y,sex,age,age_years,child_TNW,child_TNS,examiner_TNW,freq_ttr,r_2_i_verbs,mor_words,...,word_errors,f_k,n_v,n_aux,n_3s_v,det_n_pl,det_pl_n,pro_aux,pro_3s_v,total_error
0,1,,165,13.75,287,36,4,0.333,0.108108,252,...,8,1.210456,0,2,2,7,0,0,1,12
1,1,,172,14.333333,368,42,27,0.274,0.05,361,...,16,1.871708,0,4,0,5,0,0,0,9
2,1,,160,13.333333,266,26,2,0.411,0.105263,246,...,0,2.240602,0,1,0,5,0,0,0,6
3,1,,184,15.333333,405,40,21,0.359,0.148936,348,...,4,1.877762,1,0,0,11,0,0,0,12
4,1,,176,14.666667,300,35,20,0.279,0.15,294,...,8,0.339524,0,1,1,5,0,0,0,7


### Remove bad columns

In [8]:
def remove_bad_columns(df,bad_column_threshold):
    """Drop the columns that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    bad_column_threshold : int
        A column is dropped when its number of missing values is greater
        than or equal to this threshold.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the kept columns, in their original order.

    Side effects
    ------------
    Prints the number of dropped columns.
    """
    # Missing-value count per column, in column order.
    n_null = df.isnull().sum(axis=0).to_numpy()
    is_bad = n_null >= bad_column_threshold

    print('number of bad columns:',int(is_bad.sum()))

    # Select by position rather than by label, so a bad column never takes
    # a healthy column with the same name down with it.
    return df.iloc[:,np.flatnonzero(~is_bad)]

In [9]:
# Drop every column with 3 or more missing values, then re-check the schema.
df = remove_bad_columns(df,3)
df.info()

number of bad columns: 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 60 columns):
Y                        1163 non-null int64
age                      1163 non-null int64
age_years                1163 non-null float64
child_TNW                1163 non-null int64
child_TNS                1163 non-null int64
examiner_TNW             1163 non-null int64
freq_ttr                 1163 non-null float64
r_2_i_verbs              1163 non-null float64
mor_words                1163 non-null int64
num_pos_tags             1163 non-null int64
n_dos                    1163 non-null int64
repetition               1163 non-null int64
retracing                1163 non-null int64
fillers                  1163 non-null int64
s_1g_ppl                 1163 non-null float64
s_2g_ppl                 1163 non-null float64
s_3g_ppl                 1163 non-null float64
d_1g_ppl                 1163 non-null float64
d_2g_ppl                 1163 non-null float

### Remove bad rows

In [10]:
 # Find rows where target is missing
def find_missing_target_rows(df,target):
    """Locate the rows whose target value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column.
    target : str
        Name of the target column.

    Returns
    -------
    numpy.ndarray
        Integer array with the 0-based positions (not index labels) of the
        rows where the target is null, in ascending order. The array is
        empty, and still of integer dtype, when nothing is missing.

    Side effects
    ------------
    Prints the number of rows with a missing target.
    """
    # Work on the raw boolean array so the lookup is purely positional;
    # indexing the Series with 0..n-1 would be a label lookup and fails on
    # any frame whose index is not the default range.
    missing_row = df[target].isnull().to_numpy()
    print('Number of rows where target are missing:')
    print(int(missing_row.sum()))

    return np.flatnonzero(missing_row)

In [11]:
# Row positions whose label is missing; a row without a label cannot be used.
missing_target_rows = find_missing_target_rows(df,target)

Number of rows where target are missing:
0


We find bad rows which contain too many missing values, then remove them.

In [12]:
def find_bad_rows(df,bad_row_threshold):
    """Return the positions of the rows that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified and no row is removed here.
    bad_row_threshold : int
        A row is flagged when its number of missing values is greater than
        or equal to this threshold.

    Returns
    -------
    numpy.ndarray
        Integer array of 0-based row positions, in ascending order.

    Side effects
    ------------
    Prints the number of flagged rows.
    """
    # Missing-value count per row, compared against the threshold in one go.
    nulls_per_row = df.isnull().sum(axis=1).to_numpy()
    bad_row = np.flatnonzero(nulls_per_row >= bad_row_threshold)

    print('number of bad rows:',len(bad_row))
    return bad_row

In [13]:
# Flag (but do not yet remove) the rows with 3 or more missing values.
bad_rows = find_bad_rows(df,3)
df.info()

number of bad rows: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 60 columns):
Y                        1163 non-null int64
age                      1163 non-null int64
age_years                1163 non-null float64
child_TNW                1163 non-null int64
child_TNS                1163 non-null int64
examiner_TNW             1163 non-null int64
freq_ttr                 1163 non-null float64
r_2_i_verbs              1163 non-null float64
mor_words                1163 non-null int64
num_pos_tags             1163 non-null int64
n_dos                    1163 non-null int64
repetition               1163 non-null int64
retracing                1163 non-null int64
fillers                  1163 non-null int64
s_1g_ppl                 1163 non-null float64
s_2g_ppl                 1163 non-null float64
s_3g_ppl                 1163 non-null float64
d_1g_ppl                 1163 non-null float64
d_2g_ppl                 1163 non-null float64


In [14]:
# Both inputs hold 0-based row positions, whereas DataFrame.drop expects
# index labels. Map positions to labels so the right rows are removed even
# when the index is not the default 0..n-1 range.
# The cast is needed because the union of an empty float array is float,
# and float arrays cannot be used for positional indexing.
del_rows = np.union1d(missing_target_rows,bad_rows).astype(int)
print('number of rows need to delete:',len(del_rows))

df = df.drop(df.index[del_rows])

number of rows need to delete: 0


### Separate target and attributes

In [15]:
# Split the frame: dfy holds the label column, dfx everything else.
dfy = df[target]
dfx = df.drop(columns=[target])

### Find variable type

In [16]:
# Distinct non-missing values per feature column (NaN is not counted).
nu = dfx.nunique().to_numpy()
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[ 129  129  611  133   99  396  643  578   51   13   59   48   52 1160
 1161 1162 1158 1161 1156  970  970   13   13  643  643  133  133  664
 1095  970  978   93  592  628   81   56   16   17   30  103   12   43
  129   44   58   60   51   11   17   13 1161   16   24   25   32    3
    3   22   61]


In [17]:
def define_variable_type(df,nu,category_threshold=5):
    """Classify each column as binary, categorical or continuous.

    The decision uses only the number of distinct values per column. This
    is a heuristic: an integer-coded category with many levels is labelled
    continuous, so check the printed indices against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the computation; kept so existing calls keep working.
    nu : array-like of int
        Number of distinct values of each column, in column order.
    category_threshold : int, default 5
        A column with fewer distinct values than this (and not exactly 2)
        is labelled categorical. Tune it to the dataset.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as `nu` holding 1 (binary),
        2 (categorical) or 3 (continuous).

    Side effects
    ------------
    Prints the positions of the binary and the categorical columns.
    """
    nu = np.asarray(nu)
    is_binary = nu == 2
    # Everything below the threshold that is not binary, which includes
    # columns with 0 or 1 distinct values.
    is_category = (nu < category_threshold) & ~is_binary

    print('i_binary:',np.flatnonzero(is_binary).tolist())
    print('i_category:',np.flatnonzero(is_category).tolist())

    variable_type = np.full(len(nu),3.0)  # continuous unless stated otherwise
    variable_type[is_category] = 2        # categorical
    variable_type[is_binary] = 1          # binary

    return variable_type

In [18]:
# Label each feature column as 1 (binary), 2 (categorical) or 3 (continuous).
variable_type = define_variable_type(dfx,nu)
print('variable type:',variable_type)

i_binary: []
i_category: [55, 56]
variable type: [3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 2. 2. 3. 3.]


### Impute missing values of attributes

In [19]:
def impute_missing(df,variable_type):
    """Fill missing values column by column.

    Binary and categorical columns are filled with their most frequent
    value (the smallest one when several values tie); continuous columns
    are filled with their median.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    variable_type : sequence of numbers
        One code per column of `df`, in column order: values below 3 mean
        binary/categorical, anything else means continuous.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the missing values filled. A column that has
        no observed value at all is returned unchanged, because there is
        nothing to impute from.
    """
    df2 = df.copy()
    for i,col in enumerate(df.columns):
        if variable_type[i] < 3: # binary or category
            modes = df[col].mode()
            if modes.empty:
                # Entirely missing column: mode() is empty, so taking its
                # first element would raise. Leave the column as it is.
                continue
            fill_value = modes.iloc[0]
        else: # continuous
            fill_value = df[col].median()
        df2[col] = df[col].fillna(fill_value)
    return df2

In [20]:
# Fill the gaps in the features and confirm that every column is now complete.
dfx_imputed = impute_missing(dfx,variable_type)
dfx_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1163 entries, 0 to 1162
Data columns (total 59 columns):
age                      1163 non-null int64
age_years                1163 non-null float64
child_TNW                1163 non-null int64
child_TNS                1163 non-null int64
examiner_TNW             1163 non-null int64
freq_ttr                 1163 non-null float64
r_2_i_verbs              1163 non-null float64
mor_words                1163 non-null int64
num_pos_tags             1163 non-null int64
n_dos                    1163 non-null int64
repetition               1163 non-null int64
retracing                1163 non-null int64
fillers                  1163 non-null int64
s_1g_ppl                 1163 non-null float64
s_2g_ppl                 1163 non-null float64
s_3g_ppl                 1163 non-null float64
d_1g_ppl                 1163 non-null float64
d_2g_ppl                 1163 non-null float64
d_3g_ppl                 1163 non-null float64
z_mlu_sli           

## Data Processing

### Attributes

In [21]:
def convert_binary_and_category(x,variable_type):
    """
    Convert binary columns to -1/+1 and categorical columns to one-hot;
    continuous columns are passed through.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array with one column per entry of `variable_type`.
    variable_type : sequence of numbers
        Code of each column: 1 = binary, 2 = categorical, anything else =
        continuous.

    Returns
    -------
    numpy.ndarray
        2-D float array. Input column order is kept; a binary column gives
        one output column (-1 for its smallest value, +1 otherwise) and a
        categorical column gives one indicator column per distinct value,
        ordered by sorted value.

    Notes
    -----
    The one-hot step is done with NumPy so the function does not depend on
    the keyword used by scikit-learn's OneHotEncoder to request dense
    output, which differs between scikit-learn versions.
    """
    pieces = []
    for i,i_type in enumerate(variable_type):
        column = x[:,i]
        if i_type == 1: # binary
            lowest = np.unique(column)[0]
            pieces.append(np.where(column == lowest,-1.,1.)[:,np.newaxis])

        elif i_type == 2: # category
            # `codes` maps every row to the position of its value among the
            # sorted distinct values; picking rows of the identity matrix
            # turns those positions into indicator vectors.
            levels,codes = np.unique(column,return_inverse=True)
            pieces.append(np.eye(len(levels))[codes.ravel()])

        else: # continuous
            pieces.append(column[:,np.newaxis])

    if not pieces:
        # No column requested: keep the row count and return zero columns.
        return np.zeros((x.shape[0],0))

    # Stack once at the end instead of re-copying the matrix per column.
    return np.hstack(pieces).astype(float)

In [22]:
# convert x
x = np.array(dfx_imputed)
x_new = convert_binary_and_category(x,variable_type)

print(x_new.shape)
print(x_new)

(1163, 63)
[[165.          13.75       287.         ...   0.           1.
   12.        ]
 [172.          14.33333333 368.         ...   0.           0.
    9.        ]
 [160.          13.33333333 266.         ...   0.           0.
    6.        ]
 ...
 [119.           9.91666667 337.         ...   0.           4.
    9.        ]
 [112.           9.33333333 511.         ...   0.           5.
   15.        ]
 [108.           9.         495.         ...   0.           4.
   12.        ]]


### Target

In [23]:
y = np.array(dfy)
#print(np.unique(y,return_counts=True))

# Convert target to 0 and 1. The labels are used as they are (y_new is the
# same array as y, not a copy); the disabled lines below sketch how a 'No'
# string label would be mapped to 0.
y_new = y
#y_new = np.ones(y.shape[0])
#y_new[y =='No'] = 0

# Show the distinct labels and the number of rows per label.
print(np.unique(y_new,return_counts=True))

(array([0, 1]), array([896, 267]))


In [24]:
# Append the target as the last column and write the matrix as plain text
# ('%f' writes six digits after the decimal point).
xy_new = np.hstack((x_new,y_new.reshape(-1,1)))
np.savetxt('data_processed.dat',xy_new,fmt='%f')