In [3]:
# Imports
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.metrics import mean_squared_error

In [4]:
# List of binary fields (either Y/N or X/blank)
yes_no =  ['COLLECT1', 'VETERANS', 'BIBLE', 'CATLG', 'HOMEE', 'PETS', 'CDPLAY', 'STEREO', 'PCOWNERS', 'PHOTO', 'CRAFTS',
        'GARDENIN', 'BOATS', 'WALKER', 'KIDSTUFF', 'CARDS', 'PLATES']
x_blank = ['NOEXCH', 'RECINHSE', 'RECP3', 'RECPGVG', 'RECSWEEP', 'MAJOR', 'PEPSTRFL']

# List of numerical fields (non-nominal)
numerical = ['AGE', 'NUMCHLD', 'INCOME', 'MBCRAFT', 'MBGARDEN', 'MBBOOKS', 'MBCOLECT', 'MAGFAML', 'MAGFEM', 'MAGMALE', 
             'PUBGARDN', 'PUBCULIN', 'PUBHLTH', 'PUBDOITY', 'PUBNEWFN', 'PUBPHOTO', 'PUBOPP', 'MALEMILI', 'MALEVET', 
             'VIETVETS', 'WWIIVETS', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C1', 'POP90C2', 
             'POP90C3', 'POP90C4', 'POP90C5', 'ETH1', 'ETH2', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 
             'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 
             'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 
             'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN3',
             'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW1', 'DW2', 'DW3', 'DW4', 'DW5', 
             'DW6', 'DW7', 'DW8', 'DW9', 'HV1', 'HV2', 'HV3', 'HV4', 'HU1', 'HU2', 'HU3', 'HU4', 'HU5', 'HHD1', 'HHD2', 
             'HHD3', 'HHD4', 'HHD5', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC2', 'ETHC3', 
             'ETHC4', 'ETHC5', 'ETHC6', 'HVP1', 'HVP2', 'HVP3', 'HVP4', 'HVP5', 'HVP6', 'HUR1', 'HUR2', 'RHP1', 'RHP2', 
             'RHP3', 'RHP4', 'HUPA1', 'HUPA2', 'HUPA3', 'HUPA4', 'HUPA5', 'HUPA6', 'HUPA7', 'RP1', 'RP2', 'RP3', 'RP4', 
             'MSA', 'ADI', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC6', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 
             'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'HHAS1', 'HHAS2', 
             'HHAS3', 'HHAS4', 'MC1', 'MC2', 'MC3', 'TPE1', 'TPE2', 'TPE3', 'TPE4', 'TPE5', 'TPE6', 'TPE7', 'TPE8', 'TPE9', 
             'PEC1', 'PEC2', 'TPE10', 'TPE11', 'TPE12', 'TPE13', 'LFC1', 'LFC2', 'LFC3', 'LFC4', 'LFC5', 'LFC6', 'LFC7', 
             'LFC8', 'LFC9', 'LFC10', 'OCC1', 'OCC2', 'OCC3', 'OCC4', 'OCC5', 'OCC6', 'OCC7', 'OCC8', 'OCC9', 'OCC10', 
             'OCC11', 'OCC12', 'OCC13', 'EIC1', 'EIC2', 'EIC3', 'EIC4', 'EIC5', 'EIC6', 'EIC7', 'EIC8', 'EIC9', 'EIC10', 
             'EIC11', 'EIC12', 'EIC13', 'EIC14', 'EIC15', 'EIC16', 'OEDC1', 'OEDC2', 'OEDC3', 'EC1', 'EC2', 'EC3', 'EC4', 
             'EC5', 'EC6', 'EC7', 'EC8', 'SEC1', 'SEC2', 'SEC3', 'SEC4', 'SEC5', 'AFC1', 'AFC2', 'AFC3', 'AFC4', 'AFC5', 
             'AFC6', 'VC1', 'VC2', 'VC3', 'VC4', 'ANC1', 'ANC2', 'ANC3', 'ANC4', 'ANC5', 'ANC6', 'ANC7', 'ANC8', 'ANC9', 
             'ANC10', 'ANC11', 'ANC12', 'ANC13', 'ANC14', 'ANC15', 'POBC1', 'POBC2', 'LSC1', 'LSC2', 'LSC3', 'LSC4', 'VOC1', 
             'VOC2', 'VOC3', 'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6', 'HC7', 'HC8', 'HC9', 'HC10', 'HC11', 'HC12', 'HC13', 
             'HC14', 'HC15', 'HC16', 'HC17', 'HC18', 'HC19', 'HC20', 'HC21', 'MHUC1', 'MHUC2', 'AC1', 'AC2', 'CARDPROM', 
             'NUMPROM', 'CARDPM12', 'NUMPRM12', 'RAMNT_3', 'RAMNT_4', 'RAMNT_5', 'RAMNT_6', 'RAMNT_7', 'RAMNT_8', 'RAMNT_9', 
             'RAMNT_10', 'RAMNT_11', 'RAMNT_12', 'RAMNT_13', 'RAMNT_14', 'RAMNT_15', 'RAMNT_16', 'RAMNT_17', 'RAMNT_18', 
             'RAMNT_19', 'RAMNT_20', 'RAMNT_21', 'RAMNT_22', 'RAMNT_23', 'RAMNT_24', 'RAMNTALL', 'NGIFTALL', 'CARDGIFT', 
             'MINRAMNT', 'MINRDATE', 'MAXRAMNT', 'MAXRDATE', 'LASTGIFT', 'LASTDATE', 'FISTDATE', 'NEXTDATE', 'TIMELAG', 
             'AVGGIFT']

In [5]:
# Import train and test datasets
train_data = pd.read_csv('train.dat.bz2', dtype = str)
test_data = pd.read_csv('test.dat.bz2', dtype = str)
target = train_data['TARGET']

In [6]:
# Pre-processing function
def preprocess(df):
    
    for col in df:
        # If binary column, replace with 1s and 0s
        if col in yes_no:
            df[col] = df[col].replace('Y', 1)
            df[col] = df[col].replace('N', 0)
        elif col in x_blank:
            df[col] = df[col].replace('X', 1)
        # Delete column if not binary/numerical 
        elif col not in numerical:
            df = df.drop(col, axis=1)

    # Replace empty datapoints with 0, convert df to float64 datatype
    df = df.fillna(0)
    df = df.replace('', 0)
    df = df.replace(' ', 0)
    df = df.astype('float64')
    
    return df

In [7]:
# Preprocess train and test data
train_data = preprocess(train_data)
test_data = preprocess(test_data)

In [24]:
# Testing with splitting off training data
lines = 3000
X_train = train_data[:-1*lines]
X_test = train_data.iloc[-1*lines:]
Y_train = target[:-1*lines]
Y_test = target[-1*lines:]

In [33]:
# Principle Component Analysis (PCA)
def pca(components):
    pca = PCA(n_components=components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.fit_transform(X_test)
    return [X_train_pca, X_test_pca]

# Singular Value Decomposition (SVD)
def svd(components):
    svd = TruncatedSVD(n_components=components)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.fit_transform(X_test)
    return [X_train_svd, X_test_svd]

# Random Projection (RP)
def rp(components):
    rp = GaussianRandomProjection(n_components=components, random_state=1)
    X_train_rp = rp.fit_transform(X_train)
    X_test_rp = rp.fit_transform(X_test)
    return [X_train_rp, X_test_rp]

# Locally Linear Embedding (LLE)
def lle(components):
    lle = LocallyLinearEmbedding(n_components=components)
    X_train_lle = lle.fit_transform(X_train)
    X_test_lle = lle.fit_transform(X_test)
    return [X_train_lle, X_test_lle]

In [40]:
# Testing different functions
errors = []
for f in [pca, svd, rp]:
    for c in range(1, 31):

        # Modified train and test set through function
        X_train_mod, X_test_mod = f(c)[0], f(c)[1]

        # Linear Regression Model
        lr = LinearRegression()
        lr.fit(X_train_mod, Y_train)

        # Calculate RMSE
        y_pred = lr.predict(X_test_mod)
        mse = mean_squared_error(Y_test, y_pred)
        rmse = np.sqrt(mse)
        
        # Append entry and print for progress tracking
        entry = [f.__name__.upper(), c, rmse]
        errors.append(entry)
        print(entry, end='\r')

# Sort 10 lowest RMSEs
errors.sort(key=lambda x:x[2])
for i in range(10):
    print(errors[i])
    
# ['RP', 1, 3.867014412147527]
# ['SVD', 2, 3.8683395276250416]
# ['SVD', 3, 3.8688815439123228]
# ['SVD', 4, 3.8695741164399187]
# ['SVD', 1, 3.8702366446089678]
# ['RP', 2, 3.870262165310155]
# ['PCA', 1, 3.8702703017445157]
# ['RP', 3, 3.8704023707837023]
# ['PCA', 2, 3.870423361217469]
# ['PCA', 3, 3.8704633041350167]

['RP', 1, 3.867014412147527]]]]
['SVD', 2, 3.8683395276250416]
['SVD', 3, 3.8688815439123228]
['SVD', 4, 3.8695741164399187]
['SVD', 1, 3.8702366446089678]
['RP', 2, 3.870262165310155]
['PCA', 1, 3.8702703017445153]
['RP', 3, 3.8704023707837023]
['PCA', 2, 3.8704233612174694]
['PCA', 3, 3.870463304135017]


In [13]:
# Implement method
method = PCA(n_components=2)
train_mod = method.fit_transform(train_data)
test_mod = method.fit_transform(test_data)

# Linear regression
model = LinearRegression().fit(train_mod, target)
results = model.predict(test_mod)

In [14]:
# Write to prediction.txt file
with open('prediction.txt', 'w') as f:
    for result in results:
        f.write(f"{result}\n")