In [1]:
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

from sklearn.model_selection import train_test_split

In [65]:
def main(row_csv, trainData_csv, testData_csv = None):
    """
        Run the pipeline end to end and preview the processed training data.

        row_csv:           a csv file containing target column index.
        trainData_csv:     training dataset, csv file.
        testData_csv:      testing dataset, csv file (optional).

        Returns the result of .head() on the first frame produced by
        feature_extraction (the training frame).
    """
    # Stage 1: load, clean and encode the raw csv files.
    prepared, num_features, bin_features, multi_features = preprocessing(
        row_csv, trainData_csv, testData_csv
    )

    # Stage 2: feature extraction over the prepared frames.
    frames = feature_extraction(prepared, num_features, bin_features, multi_features)

    training_frame = frames[0]
    return training_frame.head()

In [97]:
def preprocessing(row_csv, trainData_csv, testData_csv = None):
    """
        Load the training (and optional testing) csv, split off the target
        column, then clean, impute, scale and label-encode the features.

        row_csv:           a csv file with a "targetIndex" column; its first
                           value is the positional index of the target column
                           in the training data.
        trainData_csv:     training dataset, csv file (read as latin1).
        testData_csv:      testing dataset, csv file (optional).

        Returns (data, numerical_cols, bin_cols, mul_cols):
            data:            [train_df, target_col, test_df] when a test file
                             is given, otherwise [train_df, target_col].
            numerical_cols:  non-object training columns with more than 6
                             distinct values (median-filled and standardized).
            bin_cols:        training columns with at most 6 distinct values
                             (label-encoded).
            mul_cols:        the remaining features (only mode-filled).

        Every statistic (median, mode, scaling parameters) is learned from the
        training data and then applied to all datasets, so train and test end
        up on the same scale. Label codes are fitted on the values of all
        datasets together so the same value always gets the same code.
    """
    # Imported locally: only this step needs scikit-learn.
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # A feature is dropped when it has at least this many missing values per
    # present value (3 means at least 75% of the column is missing).
    max_missing_per_present = 3
    # Columns with at most this many distinct values are label-encoded.
    max_levels = 6

    # create dataframes; datasets[0] is always the training frame
    train_df = pd.read_csv(trainData_csv, encoding = "latin1")
    datasets = [train_df]
    if testData_csv:
        datasets.append(pd.read_csv(testData_csv))
    row = pd.read_csv(row_csv)
    # Pick the scalar explicitly: calling int() on a whole Series is deprecated.
    target_index = int(row["targetIndex"].iloc[0])

    # get features
    features = list(train_df.columns)

    # id column
    # TODO: find an id-like column that contributes little to classification
    #       and drop it.

    # target column
    target = features[target_index]
    target_col = train_df[target]
    features.remove(target)
    datasets[0] = train_df.drop(columns = [target])

    # replace spaces with null value
    for dataset in datasets:
        for feature in features:
            dataset[feature] = dataset[feature].replace(" ", np.nan)

    # if too much nan value, we can drop the feature
    # Collect first and drop afterwards: removing from `features` while looping
    # over it skips elements, and dropping from only one dataset would leave
    # train and test with different columns.
    sparse = set()
    for dataset in datasets:
        for feature in features:
            missing = dataset[feature].isna().sum()
            present = dataset[feature].count()
            if missing > 0 and missing >= max_missing_per_present * present:
                sparse.add(feature)
    if sparse:
        features = [x for x in features if x not in sparse]
        datasets = [dataset.drop(columns = list(sparse)) for dataset in datasets]
    train_df = datasets[0]

    # numerical columns
    numerical_cols = [
        x for x in train_df.select_dtypes(exclude = "object").columns
        if train_df[x].nunique() > max_levels
    ]

    # categorical columns
    categorical_cols = [x for x in features if x not in numerical_cols]

    # binary columns
    level_counts = train_df.nunique()
    bin_cols = level_counts[level_counts <= max_levels].keys().tolist()

    # multivalues columns
    mul_cols = [x for x in categorical_cols if x not in bin_cols]

    print("feature = ", features)

    # fill in missing values
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which does not update the frame under pandas Copy-on-Write.
    for feature in numerical_cols:
        fill_value = train_df[feature].median()
        for dataset in datasets:
            dataset[feature] = dataset[feature].fillna(fill_value)

    for feature in categorical_cols:
        modes = train_df[feature].mode()
        if modes.empty:
            # No observed value to fill with; leave the column untouched.
            continue
        fill_value = modes.iloc[0]
        for dataset in datasets:
            dataset[feature] = dataset[feature].fillna(fill_value)

    # normalization: fit on the training column, apply to every dataset
    for feature in numerical_cols:
        scaler = StandardScaler()
        scaler.fit(train_df[feature].to_numpy().reshape(-1, 1))
        for dataset in datasets:
            scaled = scaler.transform(dataset[feature].to_numpy().reshape(-1, 1))
            dataset[feature] = scaled.ravel()

    # label encode: one encoder per column, fitted on the values of all datasets
    for feature in bin_cols:
        encoder = LabelEncoder()
        encoder.fit(pd.concat([dataset[feature] for dataset in datasets]))
        for dataset in datasets:
            dataset[feature] = encoder.transform(dataset[feature])

    # pack dataset for next step: [train, target] or [train, target, test]
    data = [train_df, target_col, *datasets[1:]]

    return data, numerical_cols, bin_cols, mul_cols

In [98]:
# Run the pipeline on train.csv plus test.csv; the target column index is read
# from row1.csv. main returns the head of the processed training frame.
main("row1.csv","train.csv","test.csv")

feature =  ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,-1.730108,2,"Braund, Mr. Owen Harris",1,-0.565736,0.432793,-0.473674,A/5 21171,-0.502445,2
1,-1.72622,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.663861,0.432793,-0.473674,PC 17599,0.786845,0
2,-1.722332,2,"Heikkinen, Miss. Laina",0,-0.258337,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,2
3,-1.718444,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.433312,0.432793,-0.473674,113803,0.42073,2
4,-1.714556,2,"Allen, Mr. William Henry",1,0.433312,-0.474545,-0.473674,373450,-0.486337,2


In [99]:
# Run the pipeline on FAO.csv alone (no test file); the target column index is
# read from row2.csv. main returns the head of the processed training frame.
main("row2.csv","FAO.csv")

feature =  ['Area Abbreviation', 'Area Code', 'Area', 'Item Code', 'Item', 'Element', 'Unit', 'latitude', 'longitude', 'Y1961', 'Y1962', 'Y1963', 'Y1964', 'Y1965', 'Y1966', 'Y1967', 'Y1968', 'Y1969', 'Y1970', 'Y1971', 'Y1972', 'Y1973', 'Y1974', 'Y1975', 'Y1976', 'Y1977', 'Y1978', 'Y1979', 'Y1980', 'Y1981', 'Y1982', 'Y1983', 'Y1984', 'Y1985', 'Y1986', 'Y1987', 'Y1988', 'Y1989', 'Y1990', 'Y1991', 'Y1992', 'Y1993', 'Y1994', 'Y1995', 'Y1996', 'Y1997', 'Y1998', 'Y1999', 'Y2000', 'Y2001', 'Y2002', 'Y2003', 'Y2004', 'Y2005', 'Y2006', 'Y2007', 'Y2008', 'Y2009', 'Y2010', 'Y2011', 'Y2012', 'Y2013']


Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element,Unit,latitude,longitude,Y1961,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AFG,-1.694187,Afghanistan,-1.229856,Wheat and products,1,0,0.547731,0.786473,1.034979,...,0.558353,0.593188,0.626673,0.691995,0.680541,0.725881,0.713496,0.708875,0.70264,0.694641
1,AFG,-1.694187,Afghanistan,0.743697,Rice (Milled Equivalent),1,0,0.547731,0.786473,0.011582,...,-0.012069,-0.007954,0.010161,-0.009658,-0.005534,-0.019354,-0.015933,-0.012738,-0.022416,-0.024695
2,AFG,-1.694187,Afghanistan,-1.21643,Barley and products,0,0,0.547731,0.786473,-0.051171,...,-0.084834,-0.049269,-0.045282,-0.04598,-0.052951,-0.025861,-0.038186,-0.059255,-0.032006,-0.034665
3,AFG,-1.694187,Afghanistan,-1.21643,Barley and products,1,0,0.547731,0.786473,0.043252,...,-0.059235,-0.087421,-0.08784,-0.086653,-0.083589,-0.084424,-0.082866,-0.081577,-0.079792,-0.078247
4,AFG,-1.694187,Afghanistan,-1.209717,Maize and products,0,0,0.547731,0.786473,0.027417,...,-0.072337,-0.054804,-0.050943,-0.048628,-0.04985,-0.059119,-0.06219,-0.0613,-0.05962,-0.060396


In [71]:
def feature_extraction(data, numerical_cols, bin_cols, mul_cols):
    """
        Placeholder for the feature-extraction stage: currently it only
        unpacks the preprocessed data and returns the frames unchanged.

        data:              [train_df, target_col, test_df] or [train_df, target_col].
        numerical_cols:    names of numerical columns (not used yet).
        bin_cols:          names of label-encoded columns (not used yet).
        mul_cols:          names of multi-valued columns (not used yet).

        Returns [train_df, test_df] when data has three items,
        otherwise [train_df].
    """

    # unpack data
    includes_test = len(data) == 3
    train_df, target_col = data[0], data[1]

    # TODO: derive new features from the column groups above.

    if includes_test:
        return [train_df, data[2]]
    return [train_df]

In [19]:
# TODO: implement feature_selection() -- placeholder for the feature-selection stage.
    

In [20]:
# TODO: implement estimation() -- placeholder for the model-estimation stage.
    

In [21]:
# TODO: implement postprocessing() -- placeholder for the post-processing stage.
    