In [6]:
import time
import numpy as np
import random
from sklearn import linear_model
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [7]:
#Function to load the data file into memory.

#Input: File path to read.
#Output: A 2d numpy array of strings holding every sample loaded from the file.

def parseFile(file):
    """Load a comma-separated text file into memory as an array of strings.

    Every line of the file is split on ','. Surrounding whitespace
    (including the trailing newline) is stripped from the last field only;
    all other fields are kept exactly as they appear in the file.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        One row per line and one column per comma-separated field, with all
        values kept as strings. The result is 2d only when every line has
        the same number of fields.
    """
    time_start = time.time()

    content = []
    with open(file) as txtfile:
        for line in txtfile:
            fields = line.split(',')
            # Only the last field carries the line terminator.
            fields[-1] = fields[-1].strip()
            content.append(fields)

    content_mat = np.array(content)

    time_end = time.time()
    print('Reading data is complete! Running time is ' + str(time_end - time_start) + 's!')

    return content_mat

In [8]:
#Function to filter the samples with no missing values. 
#Input: mat - 2d Numpy Array.
#Output: mat - 2d Numpy Array containing only the samples that have no missing values.

def filter_full_feature(mat):
    """Return only the rows of ``mat`` that have no missing-value marker.

    A row is dropped when any of its cells equals 'N/A' or 'NA'. The number
    of rows kept is printed before the filtered array is returned.

    Parameters
    ----------
    mat : numpy.ndarray
        2d array of samples (one sample per row).

    Returns
    -------
    numpy.ndarray
        The rows of ``mat`` without missing values, in their original order.
    """
    complete_rows = [
        idx
        for idx, sample in enumerate(mat)
        if not ('N/A' in sample or 'NA' in sample)
    ]
    print('There are a total of ' + str(len(complete_rows)) + ' samples fed into the model')
    return mat[complete_rows, :]

#Function to split up the full dataset into a training and testing set in 80:20 ratio.
#Input: mat - A 2d Numpy array
#Output: train_mat, test_mat - 2 separate Numpy arrays, each a subset of the full mat set,
#        with a rough ratio of 80:20

def train_test_split(mat, test_ratio=0.2, seed=None):
    """Randomly split the rows of ``mat`` into a training and a testing set.

    Row 0 is copied into BOTH outputs (the training and testing functions in
    this file skip row 0 of the matrix they receive — presumably a header
    row; confirm against the input data). Every other row is assigned
    independently: it goes to the test set with probability ``test_ratio``
    and to the training set otherwise, so the resulting sizes are only
    approximately in the requested ratio.

    Parameters
    ----------
    mat : numpy.ndarray
        2d array, one sample per row.
    test_ratio : float, optional
        Probability of a row landing in the test set. Defaults to 0.2, the
        value that was previously hard-coded.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so
        the split is reproducible. When None (default) the module-level
        ``random`` generator is used, exactly as before.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train_mat, test_mat``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError('test_ratio must be between 0 and 1, got ' + str(test_ratio))

    # A dedicated generator keeps a seeded split from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    train_list = []
    test_list = []
    num_sample, _ = mat.shape

    for i in range(num_sample):
        if i == 0:
            # Row 0 is shared by both subsets.
            train_list.append(i)
            test_list.append(i)
        elif rng.random() >= test_ratio:
            train_list.append(i)
        else:
            test_list.append(i)

    train_mat = mat[train_list, :]
    test_mat = mat[test_list, :]

    return train_mat, test_mat

In [15]:
def model_train(mat, label_location, random_state=None):
    """Fit a random-forest classifier on ``mat`` and report its training error.

    Row 0 of ``mat`` is skipped (presumably a header row — confirm against
    the caller). The column at ``label_location`` supplies the integer
    labels; all remaining columns are converted to float and used as
    features. Feature column 9 (index counted AFTER the label column has
    been removed) is squared and appended as one extra feature.

    Parameters
    ----------
    mat : numpy.ndarray
        2d array whose cells are convertible to float (features) / int
        (labels), apart from row 0 which is ignored.
    label_location : int
        Column index of the label inside ``mat``.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier`` so a run can be made
        reproducible. None (default) keeps the previous unseeded behaviour.

    Returns
    -------
    (model, float)
        The fitted classifier and the training error, i.e. one minus the
        fraction of training rows predicted correctly.
    """
    #model = linear_model.LogisticRegression()
    model = ensemble.RandomForestClassifier(
        n_estimators=15,
        min_samples_split=30,
        min_samples_leaf=18,
        random_state=random_state,
    )

    # The builtin float/int replace the np.float/np.int aliases, which were
    # removed in NumPy 1.24 and now raise AttributeError.
    feature_mat = np.delete(mat, label_location, axis=1)[1:, :].astype(float)
    # The 9:10 slice keeps the column 2d so it can be concatenated directly.
    squared_col = feature_mat[:, 9:10] * feature_mat[:, 9:10]
    feature_mat = np.concatenate((feature_mat, squared_col), axis=1)
    labels = mat[1:, label_location].astype(int)

    print('Model training - Started!')
    time_start = time.time()
    model.fit(feature_mat, labels)
    time_end = time.time()
    print('Model training - Completed! Training time: ' + str(time_end - time_start) + 's')

    predicted_lab = model.predict(feature_mat)
    corrected_pred = np.sum(labels == predicted_lab)

    training_error = 1 - corrected_pred / labels.size

    return model, training_error


def model_test(model, mat, label_location,
               pred_path='predicted_lab_RF.txt', label_path='label_test_RF.txt'):
    """Evaluate a fitted classifier on ``mat`` and return its test error.

    The feature matrix is built the same way as in training: row 0 of
    ``mat`` is skipped, the label column is removed, the rest is converted
    to float, and the square of feature column 9 is appended. The ROC AUC
    (computed from column 1 of ``model.predict_proba``) is printed, and the
    predicted and true labels are written to text files with
    ``numpy.savetxt``.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    mat : numpy.ndarray
        2d array laid out like the training matrix.
    label_location : int
        Column index of the label inside ``mat``.
    pred_path : str, optional
        Destination of the predicted labels. Defaults to the previously
        hard-coded 'predicted_lab_RF.txt'.
    label_path : str, optional
        Destination of the true labels. Defaults to the previously
        hard-coded 'label_test_RF.txt'.

    Returns
    -------
    float
        One minus the fraction of rows predicted correctly.
    """
    # The builtin float/int replace the np.float/np.int aliases, which were
    # removed in NumPy 1.24 and now raise AttributeError.
    feature_mat = np.delete(mat, label_location, axis=1)[1:, :].astype(float)
    # The 9:10 slice keeps the column 2d so it can be concatenated directly.
    squared_col = feature_mat[:, 9:10] * feature_mat[:, 9:10]
    feature_mat = np.concatenate((feature_mat, squared_col), axis=1)
    labels = mat[1:, label_location].astype(int)

    predicted_lab = model.predict(feature_mat)
    corrected_pred = np.sum(labels == predicted_lab)

    label_score = model.predict_proba(feature_mat)

    print(roc_auc_score(labels, label_score[:, 1]))

    np.savetxt(pred_path, predicted_lab.astype(int))
    np.savetxt(label_path, labels.astype(int))

    test_error = 1 - corrected_pred / labels.size
    return test_error

def model_sim(model, mat):
    """Predict labels for ``mat`` and return features and labels side by side.

    Unlike the training and testing functions, every row and every column of
    ``mat`` is used: no row is skipped and no label column is removed. The
    matrix is converted to float, the square of column 9 is appended as an
    extra feature, and the model's predictions are appended as the final
    column.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    mat : numpy.ndarray
        2d array whose cells are all convertible to float.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows, columns + 2)``: the original features, the
        squared column, then the predicted label.
    """
    # The builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24 and now raises AttributeError.
    feature_mat = mat.astype(float)
    num_sim = feature_mat.shape[0]
    # The 9:10 slice keeps the column 2d so it can be concatenated directly.
    squared_col = feature_mat[:, 9:10] * feature_mat[:, 9:10]
    feature_mat = np.concatenate((feature_mat, squared_col), axis=1)
    predicted_lab = model.predict(feature_mat).reshape(num_sim, 1)

    full_mat = np.concatenate((feature_mat, predicted_lab), axis=1)

    return full_mat

In [16]:
def main():
    """Run the whole pipeline: load, select columns, filter, split, train, test, simulate.

    Steps, in order:
      1. Load the survey matrix and the simulation matrix from text files.
      2. Keep only the columns listed in ``sentiment_list``.
      3. Drop the last 200000 rows, then drop rows containing 'N/A' / 'NA'.
      4. Reorder the kept columns (see the comments below for the final order).
      5. Randomly split into train/test, fit the model, evaluate it.
      6. Predict on the simulation matrix and save the result to
         'sim_result.txt'.

    Takes no arguments and returns nothing; results are printed and written
    to files.
    """
    content_mat = parseFile('CleanedData/gallup_clean_NA_No_D_coor.txt')
    #content_mat = parseFile('CleanedData/gallup_clean_NA_determinant.txt')
    #content_mat = parseFile('CleanedData/gallup_mean_filled_cleaned.txt')
    
    sim_mat = parseFile('sim_out.txt')
    
    num_sample, num_var = content_mat.shape
    
    #sentiment_list = [0, 1, 2, 3, 21, 22, 32] #sentiment & label(21)

    # Original column indices to keep, listed in the order they should appear.
    sentiment_list = [0, 1, 2, 3, 21, 22, 32, 29, 6, 47, 48, num_var -2,  num_var-1] #sentiment & background & label(21)
    # np.delete below leaves the kept columns in ascending index order, so the
    # position of each wanted column afterwards is its rank in the sorted list.
    # Indexing with sentiment_idx_list later restores the sentiment_list order
    # (assumes the entries of sentiment_list are distinct).
    new_sen_list = np.sort(sentiment_list)
    sentiment_idx_list = np.searchsorted(new_sen_list, sentiment_list)
    
    # Every column that is not in sentiment_list is removed.
    full_list = list(range(0, num_var))
    delete_list = list(set(full_list) - set(sentiment_list))

    content_mat = np.delete(content_mat, delete_list, axis=1)
        
    # Discard the last 200000 rows (the reason is not visible in this file).
    content_mat = content_mat[:-200000, :]
    
    content_mat = filter_full_feature(content_mat)
    content_mat = content_mat[:, sentiment_idx_list]
    
    # Move the last two columns (original num_var-2 and num_var-1) to sit right
    # after the first seven. The slice stop num_var + 1 is past the end of the
    # 13-column matrix, so -2:num_var + 1 simply means "the last two columns".
    # Final order in original indices:
    #   0, 1, 2, 3, 21, 22, 32, num_var-2, num_var-1, 29, 6, 47, 48
    content_mat = np.concatenate((content_mat[:, 0:7] , content_mat[:, -2:num_var + 1] , content_mat[:, 7:11]), axis=1)
    #content_mat = np.concatenate((content_mat, content_mat[:, 10].astype(np.int) * content_mat[:, 10].astype(np.int)), axis=1)
    
    # Row 0 ends up in both subsets; model_train/model_test skip row 0
    # (presumably a header row — TODO confirm against the data file).
    train_mat, test_mat = train_test_split(content_mat)

    # Column 4 of the reordered matrix is original column 21, the label.
    model, train_error = model_train(train_mat, 4)

    test_error = model_test(model, test_mat, 4)
    
    # sim_mat is passed as loaded: model_sim uses all of its rows and columns.
    sim_result = model_sim(model, sim_mat)

    print('The training error for this trail is: ' + str(train_error))
    print('The testing error for this trail is: ' + str(test_error))
    
    np.savetxt('sim_result.txt', sim_result)


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Reading data is complete! Running time is 64.16577959060669s!
Reading data is complete! Running time is 0.2495899200439453s!
There are a total of 1010924 samples fed into the model
Model training - Started!
Model training - Completed! Training time: 40.87759041786194s
0.8118732270275417
The training error for this trail is: 0.2245326513348932
The testing error for this trail is: 0.25046584848679565
