In [1]:
# Import lib
# ===========================================================
import csv
import pandas as pd
import numpy as np
import random
import time
import collections
import math
import sys
from tqdm import tqdm
from time import sleep

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

from datascience import *
from scipy import stats

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [2]:
# Initialize useful data
# ===========================================================
# Input CSV, read twice below: once as raw rows, once as a DataFrame.
csv_path = 'clinvar_conflicting_clean.csv'

# Every row of the file as a list of strings (this includes the first line,
# which pandas treats as the header).  newline='' is what the csv module asks
# for so that newlines embedded inside quoted fields are parsed correctly.
with open(csv_path, 'r', newline='') as f:
    reader = csv.reader(f)
    temp_rows = list(reader)

df = pd.read_csv(csv_path, low_memory=False)
columns_to_change = ['ORIGIN', 'EXON', 'INTRON', 'STRAND', 'LoFtool', 'CADD_PHRED', 'CADD_RAW', 'BLOSUM62']

# Columns whose missing entries are replaced by the literal string "null".
# The list is spelled out once so the selection on both sides of the
# assignment cannot drift apart.
null_filled_columns = [
    'CLNVI', 'MC', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE',
    'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons',
    'BAM_EDIT', 'SIFT', 'PolyPhen',
]
df[null_filled_columns] = df[null_filled_columns].fillna(value="null")



In [None]:
# map categorical data to numerical data
# ===========================================================
def uniq_val(column):
    """Return the set of distinct first elements found in the rows of ``column``.

    ``column`` is indexed positionally from 0 to ``len(column) - 1`` and
    element ``[0]`` of every row is collected.
    """
    distinct = set()
    for idx in range(len(column)):
        distinct.add(column[idx][0])
    return distinct

def is_numeric(value):
    """Return True when ``value`` is an integer or floating-point scalar.

    Besides the built-in ``int`` and ``float`` this also accepts NumPy scalar
    types (``np.integer`` / ``np.floating``).  Values taken out of a pandas or
    NumPy container are NumPy scalars, and e.g. ``np.int64`` is not a subclass
    of ``int``, so the built-in check alone reported them as non-numeric.

    Strings, ``None`` and containers (lists, arrays) return False.
    """
    return isinstance(value, (int, float, np.integer, np.floating))

def map_categ2numer():
    """Encode every non-numeric column of the global ``df`` as numbers.

    For each column whose dtype is not numeric, the distinct values are
    numbered ``0 .. n-1`` in order of first appearance and each value is
    replaced by ``i / n`` (so codes fall in ``[0, 1)``).  Columns that already
    have a numeric dtype are left untouched.

    The function works on the module-level ``df`` and assigns the encoded
    columns back into it; it takes no arguments and returns ``None``.
    """
    for attribute in df.columns.values:
        column = df[attribute]

        # Decide from the column dtype.  The earlier guard looked at
        # ``df[[attribute]].values[0]``, which is a one-element array rather
        # than an int/float, so it never skipped anything and numeric columns
        # were re-coded as well.
        if pd.api.types.is_numeric_dtype(column):
            continue

        categories = pd.unique(column)
        length = len(categories)
        mapping = {category: i / length for i, category in enumerate(categories)}

        # One lookup pass per column.  Replacing value by value rescans the
        # column for every category and can re-map a value that was just
        # written when a new code equals a not-yet-processed original value.
        df[attribute] = column.map(mapping)

# Run the encoding defined above (it assigns the encoded columns back into the
# global df), then display the first rows of the result as the cell output.
map_categ2numer()
df.head()

In [None]:
# Shuffle all rows once.  The seed is fixed so that the positional train/test
# split taken from this frame is the same on every Restart & Run All; without
# it every run trains and evaluates on different rows.
df = df.sample(n=df.shape[0], random_state=42)

# Plain-Python copy of the shuffled rows, and the total row count.
all_rows = df.values.tolist()
row_num = len(all_rows)

In [None]:
# Divide whole dataset into training set and testing set
# ===========================================================
training_percentage = 0.01  # fraction (not a percentage) of the rows used for training
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size

# Positional split: the first `training_size` rows are the training frame,
# everything after them is the testing frame.
trainingframe, testingframe = df.iloc[:training_size], df.iloc[training_size:]

# Table versions of both frames (Table presumably comes from the
# `from datascience import *` wildcard import).
trainingset, testingset = Table.from_df(trainingframe), Table.from_df(testingframe)

In [None]:
# formula = 'CLASS ~ CHROM + POS + REF + ALT + AF_ESP + AF_EXAC + AF_TGP + CLNDISDB + CLNDN + CLNVC + CLNVI + MC + ORIGIN + Allele + Consequence + IMPACT + SYMBOL + Feature_type + Feature + BIOTYPE + EXON + INTRON + cDNA_position + CDS_position + Protein_position + Amino_acids + Codons + STRAND + BAM_EDIT + SIFT + PolyPhen + LoFtool + CADD_PHRED + CADD_RAW + BLOSUM62'

# Predictor columns.  The same list builds the GLM formula below and is used
# by later cells to select the matching columns from the test set.
selected_attribute = ['CHROM',
 'POS',
 'REF',
 'ALT',
 'AF_ESP',
 'AF_EXAC',
 'AF_TGP',
 'CLNDISDB',
 'CLNDN']
# i removed 'CLNHGVS'

# Derive the formula from the list so the two can never disagree; this yields
# 'CLASS ~ CHROM + POS + REF + ALT + AF_ESP + AF_EXAC + AF_TGP + CLNDISDB + CLNDN'.
formula = 'CLASS ~ ' + ' + '.join(selected_attribute)

# Alternative predictor groups that were tried:
# formula = 'CLASS ~ CLNVC + CLNVI + MC + ORIGIN + Allele + Consequence + IMPACT + SYMBOL'
# formula = 'CLASS ~ Feature_type + Feature + BIOTYPE + EXON + INTRON + cDNA_position'
# formula = 'CLASS ~ CDS_position + Protein_position + Amino_acids + Codons + STRAND + BAM_EDIT'
# formula = 'CLASS ~ SIFT + PolyPhen + LoFtool + CADD_PHRED + CADD_RAW + BLOSUM62'

# The family expects a link *instance*, not the link class itself.
# NOTE(review): newer statsmodels releases appear to name the class `Probit`
# and keep lowercase `probit` only as a deprecated alias -- use whichever exists.
_probit_link = getattr(sm.families.links, 'Probit', None) or sm.families.links.probit

# Binomial GLM with a probit link, fitted on the training frame only.
model = smf.glm(formula=formula, data=trainingframe,
                family=sm.families.Binomial(link=_probit_link()))
result = model.fit()
# result.summary()

In [None]:
# Keep only the predictor columns of the test set, convert the result back to
# a pandas DataFrame, and show its first rows as the cell output.
selected_table = testingset.select(selected_attribute)
dataframe = selected_table.to_df()
dataframe.head()

In [None]:
# Predictor columns of the test set as a DataFrame, then the fitted model's
# predictions for them.
test_predictors = testingset.select(selected_attribute).to_df()
pred = result.predict(test_predictors)

# NOTE(review): PRE_CLASS stores 1 - pred, i.e. the complement of what
# predict() returned -- confirm this inversion is intended.
complement = 1 - pred.to_numpy()

# Table holding the true CLASS column next to the PRE_CLASS column.
final = testingset.select('CLASS').with_column('PRE_CLASS', complement)

In [None]:
import sklearn
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Scale every feature into the range [-1, 1] for more efficient computation
normalized_range = MinMaxScaler(feature_range=(-1, 1))

# Extract Features + Labels
# NOTE(review): `features` and `labels` were never defined anywhere above this
# cell.  They are taken here from the same predictors and target the GLM cell
# uses (selected_attribute / CLASS) -- confirm that this is the intended data.
# Rows with a missing value in any of those columns are dropped first.
model_frame = df[selected_attribute + ['CLASS']].dropna()
features = model_frame[selected_attribute].to_numpy(dtype=float)
# Integer class codes (0, 1, ...) in sorted order of the CLASS values; the
# result is one-dimensional, which is the shape scikit-learn expects, with no
# hard-coded row count.
labels, class_values = pd.factorize(model_frame['CLASS'], sort=True)

# Create Test/Train (fixed seed so the score is reproducible)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.4, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# no information from the test set leaks into the preprocessing.
features_train = normalized_range.fit_transform(features_train)
features_test = normalized_range.transform(features_test)

# Scikit Logistic Regression
scikit_log_reg = LogisticRegression()
scikit_log_reg.fit(features_train, labels_train)

# Score is Mean Accuracy (taken from the model fitted above; the name `clf`
# used previously was never defined)
scikit_score = scikit_log_reg.score(features_test, labels_test)
print('Scikit score: ', scikit_score)