In [3]:
import pandas as pd 
import numpy as np
import pickle

In [5]:
'''
NAME: loadIntensity
DESCRIPTION: Take in a .csv file with all the intensity ratio values (same type)
and return a matrix.
INPUT: filename, delimiter, skiprows, dtype
OUTPUT: matrix
'''
def loadIntensity(filename, delimiter=',', skiprows=1, dtype=float):
    '''
    Load a delimited text file of same-typed values into a 2-D NumPy array.

    Values are streamed one at a time through ``np.fromiter`` and the flat
    result is reshaped to one array row per data line.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    delimiter : str, optional
        Field separator (default ``','``).
    skiprows : int, optional
        Number of leading lines to discard (default 1). Asking to skip more
        lines than the file contains is not an error.
    dtype : callable usable as a NumPy dtype, optional
        Called on every non-empty field and also used as the dtype of the
        returned array (default ``float``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_data_lines, n_fields)``; shape ``(0, 0)`` when
        the file holds no data lines.

    Raises
    ------
    ValueError
        If a data line has a different number of fields than the first data
        line, or if ``dtype`` rejects a field.

    Notes
    -----
    Empty fields are read as 0. Lines that are blank after stripping
    trailing whitespace are skipped.
    '''
    # Mutable holder so the nested generator can report the row width back to
    # the enclosing function (works without `nonlocal`).
    shape_info = {"width": None}

    def iter_func():
        with open(filename, 'r') as infile:
            for _ in range(skiprows):
                # Default of None: running out of lines here must not raise
                # StopIteration inside a generator.
                next(infile, None)
            for lineno, line in enumerate(infile, start=skiprows + 1):
                stripped = line.rstrip()
                if not stripped:
                    # A blank line (e.g. at end of file) carries no row data.
                    continue
                fields = stripped.split(delimiter)
                if shape_info["width"] is None:
                    # The first data line fixes the expected row width.
                    shape_info["width"] = len(fields)
                elif len(fields) != shape_info["width"]:
                    # Fail loudly instead of letting reshape misalign values.
                    raise ValueError(
                        "%s, line %d: expected %d fields but found %d"
                        % (filename, lineno, shape_info["width"], len(fields))
                    )
                for item in fields:
                    if item == "":
                        yield 0
                    else:
                        yield dtype(item)

    data = np.fromiter(iter_func(), dtype=dtype)

    width = shape_info["width"]
    if width is None:
        # No data lines at all: reshape((-1, 0)) is ambiguous, so be explicit.
        width = 0
        data = data.reshape((0, 0))
    else:
        data = data.reshape((-1, width))

    # Still published on the function object in case other code reads it.
    loadIntensity.rowlength = width
    return data

'''
NAME: matrixToDF
DESCRIPTION: Given the raw matrix, clean and label to return the final dataframe
INPUT: matrix
OUTPUT: dataframe
'''

def matrixToDF(rawMatrix, snpFile=None, skiprows=10):
    '''
    Transpose the raw matrix, drop its first two transposed rows and label
    the columns with SNP names read from a reference CSV.

    Parameters
    ----------
    rawMatrix : numpy.ndarray
        2-D array; it is transposed before being wrapped in a DataFrame.
        Presumably one row per SNP with two leading non-sample columns --
        TODO confirm against the intensity file layout.
    snpFile : str, optional
        CSV file containing a 'SNP Name' column. When omitted, the original
        hard-coded example file path is used.
    skiprows : int, optional
        Number of leading lines of ``snpFile`` to skip before its header
        row (default 10).

    Returns
    -------
    pandas.DataFrame
        The transposed data without its first two rows; the remaining rows
        keep their original positional labels (2, 3, ...), and the columns
        are the SNP names in file order.

    Raises
    ------
    KeyError
        If ``snpFile`` has no 'SNP Name' column.
    ValueError
        If the number of SNP names differs from the number of columns.
    '''
    if snpFile is None:
        # Historical default location of the reference genotype file.
        snpFile = "C:/Users/Charity Faith/Desktop/Data/GenotypeFiles/example_file.csv"

    # Transpose, wrap in a DataFrame, then drop the rows labelled 0 and 1.
    rawDF = pd.DataFrame(rawMatrix.transpose())
    rawDF = rawDF.drop(rawDF.index[[0, 1]])

    # Read the reference file only for its SNP names; the context manager
    # closes the handle as soon as parsing is done.
    with open(snpFile) as snpHandle:
        tempDF = pd.read_csv(snpHandle, skiprows=skiprows)
    colNames = tempDF['SNP Name'].values.tolist()

    # Columns are assigned by position -- this assumes the SNP order in
    # snpFile matches the row order of rawMatrix (TODO confirm).
    rawDF.columns = colNames
    return rawDF

# Read the intensity CSV into a matrix, convert it to a labelled DataFrame,
# and preview the first rows.
intensityCsvPath = "C:/Users/Charity Faith/Desktop/Data/GenotypeFiles/intensityOnly.csv"
rawDF = matrixToDF(loadIntensity(intensityCsvPath))
rawDF.head()

Unnamed: 0,200003,200006,200047,200050,200052,200053,200070,200078,200087,200091,...,rs9999853,rs999986,rs9999883,rs9999929,rs9999931,rs9999944,rs999995,rs9999963,rs9999966,rs9999979
2,0.1104,0.0415,0.2462,-0.0202,0.0836,0.019,0.0798,0.04,-0.1211,-0.0249,...,-0.0954,-0.0589,0.1433,-0.1803,0.0032,-0.0197,-0.0141,0.0246,-0.0998,-0.1488
3,0.0282,0.0169,0.137,0.024,0.0813,0.1355,0.1739,-0.055,-0.0413,-0.0933,...,0.0139,0.1347,0.0669,-0.0491,-0.0861,-0.0258,0.1026,0.0656,0.052,-0.2991
4,-0.1356,-0.0233,-0.0686,-0.0722,0.0553,-0.0539,-0.0362,-0.093,-0.1916,-0.0444,...,0.0378,-0.067,0.0994,0.0015,0.0971,-0.0898,-0.0734,0.0709,-0.0677,0.0083
5,0.0157,-0.0046,0.0904,0.0544,0.0102,0.0087,-0.0518,-0.0507,-0.159,0.0179,...,0.0682,-0.0112,0.094,-0.2411,-0.1718,0.0624,0.0506,0.1439,-0.0408,-0.1801
6,0.0385,0.0176,-0.0511,0.0469,-0.0003,0.0103,-0.1398,0.0324,-0.0602,0.0094,...,-0.084,-0.0433,0.1003,-0.0525,0.0541,-0.0147,0.028,0.0142,-0.1009,0.1672


In [6]:
# merge intensities and phenotype data
# NOTE: finalDF is the same object as rawDF (no copy is made), so the column
# additions and the in-place set_index below also change rawDF.
finalDF = rawDF
# Read through a context manager so the file handle is closed once parsed.
with open("C:/Users/Charity Faith/Desktop/Data/GenotypeFiles/phenotypeOnly.csv") as pheFile:
    finalPhe = pd.read_csv(pheFile)
# Values are attached by position (via tolist), not by index alignment, so the
# phenotype rows must be in the same order as the rows of rawDF.
finalDF['Subject'] = finalPhe['ReportInd'].tolist()
finalDF['Case/Control'] = finalPhe['Case/Control'].tolist()
finalDF.set_index(keys='Subject', inplace=True)
finalDF.head()

Unnamed: 0_level_0,200003,200006,200047,200050,200052,200053,200070,200078,200087,200091,...,rs999986,rs9999883,rs9999929,rs9999931,rs9999944,rs999995,rs9999963,rs9999966,rs9999979,Case/Control
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.1104,0.0415,0.2462,-0.0202,0.0836,0.019,0.0798,0.04,-0.1211,-0.0249,...,-0.0589,0.1433,-0.1803,0.0032,-0.0197,-0.0141,0.0246,-0.0998,-0.1488,3
3,0.0282,0.0169,0.137,0.024,0.0813,0.1355,0.1739,-0.055,-0.0413,-0.0933,...,0.1347,0.0669,-0.0491,-0.0861,-0.0258,0.1026,0.0656,0.052,-0.2991,2
5,-0.1356,-0.0233,-0.0686,-0.0722,0.0553,-0.0539,-0.0362,-0.093,-0.1916,-0.0444,...,-0.067,0.0994,0.0015,0.0971,-0.0898,-0.0734,0.0709,-0.0677,0.0083,2
7,0.0157,-0.0046,0.0904,0.0544,0.0102,0.0087,-0.0518,-0.0507,-0.159,0.0179,...,-0.0112,0.094,-0.2411,-0.1718,0.0624,0.0506,0.1439,-0.0408,-0.1801,2
8,0.0385,0.0176,-0.0511,0.0469,-0.0003,0.0103,-0.1398,0.0324,-0.0602,0.0094,...,-0.0433,0.1003,-0.0525,0.0541,-0.0147,0.028,0.0142,-0.1009,0.1672,2


In [7]:
# Persist the merged DataFrame to disk as a pickle.
file_Name = "C:/Users/Charity Faith/Desktop/Data/_01_finalDF"
# `with` guarantees the file is flushed and closed even if pickle.dump raises.
with open(file_Name, 'wb') as fileObject:
    pickle.dump(finalDF, fileObject)