In [1]:
## import pandas 
import pandas as pd 

In [2]:
## read the merged, phased hapmap CSV into a dataframe
hapmap_csv = 'mikk_clean_merged_phased1_hapmap.csv'
clean_df = pd.read_csv(hapmap_csv)

In [3]:
clean_df.shape 

(11359823, 2)

In [3]:
## give the two hapmap columns descriptive headers
## (rename returns a new frame, so clean_df itself is left untouched)
hapmap_headers = {'Unnamed: 0': 'Markers', 'V1': 'Haplotypes'}
clean_df1 = clean_df.rename(columns=hapmap_headers)
clean_df1.columns

Index(['Markers', 'Haplotypes'], dtype='object')

In [4]:
## replace custom markers with real markers 
## Adding the genotype markers created from pos+chrom 
# The marker file holds one marker id per line. Only the line terminator is
# stripped: slicing each line with [:-1] would also chop the last character
# off the final id whenever the file does not end with a newline.
with open('./mikk_clean_markers_ed2.txt', 'r') as file:
    marker_ids = [line.rstrip('\n') for line in file]

# quick look at the first few ids
print(marker_ids[0:10])

['Ol223467v1_chr1_12989', 'Ol223467v1_chr1_13217', 'Ol223467v1_chr1_13234', 'Ol223467v1_chr1_13314', 'Ol223467v1_chr1_13359', 'Ol223467v1_chr1_13361', 'Ol223467v1_chr1_13386', 'Ol223467v1_chr1_13396', 'Ol223467v1_chr1_13406', 'Ol223467v1_chr1_13425']


In [6]:
print(len(marker_ids))

11359823


In [5]:
## replace the row indices with the marker ids read from the marker file
## (pandas raises if the number of ids differs from the number of rows)
clean_df1.index = pd.Index(marker_ids)
clean_df1.shape

(11359823, 2)

In [8]:
clean_df1.head()

Unnamed: 0,Markers,Haplotypes
Ol223467v1_chr1_12989,1,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_13217,2,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_13234,3,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_13314,4,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_13359,5,1111111111111111111111111111111111111111111111...


In [6]:
## begin the filtering process 
## helper used to spot redundant rows (rows that carry a single repeated value)
def same_value(row): 
    """Return True when ``row`` contains exactly one distinct element.

    ``row`` may be any iterable of hashable items, for example a string, in
    which case its characters are compared. An empty iterable yields False
    because it has zero distinct elements.
    """
    distinct = set(row)
    return len(distinct) == 1

# boolean mask: True for rows whose 'Haplotypes' entry holds one repeated value
haplotype_entries = clean_df1['Haplotypes']
same_val_mask = haplotype_entries.apply(same_value)

## rows where all values are the same
redundant_df = clean_df1.loc[same_val_mask]

## rows where at least two different values occur
unique_df = clean_df1.loc[~same_val_mask]


In [10]:
redundant_df.shape 

(1959696, 2)

In [11]:
unique_df.shape 

(9400127, 2)

In [7]:
## keep only the first row for each distinct 'Haplotypes' value among the
## same-value rows, then append those rows to the different-value rows
is_repeat = redundant_df.duplicated(subset='Haplotypes')
sorted_redundant_df = redundant_df.loc[~is_repeat]

## different-value rows first, followed by the de-duplicated same-value rows
final_clean_df = pd.concat([unique_df, sorted_redundant_df])

In [13]:
sorted_redundant_df.head()

Unnamed: 0,Markers,Haplotypes
Ol223467v1_chr1_12989,1,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_104017,1282,0000000000000000000000000000000000000000000000...
Ol223467v1_chr1_654296,9983,2222222222222222222222222222222222222222222222...
Ol223467v1_chr1_4121225,83238,3333333333333333333333333333333333333333333333...
Ol223467v1_chr1_23905447,405971,4444444444444444444444444444444444444444444444...


In [None]:
## filter the redundant file from the actual file 
# NOTE(review): this reads the whole genotype CSV into memory in one go, and
# `original_clean_df` is not referenced again in the visible part of the
# notebook (later cells re-read the same file in chunks) — confirm whether
# this cell is still needed.
original_clean_df = pd.read_csv('mikk_clean_geno_marked2.csv') 

In [8]:
final_clean_df.shape

(9400132, 2)

In [13]:
final_clean_df.head()

Unnamed: 0,Markers,Haplotypes
Ol223467v1_chr1_13314,4,1111111111111111111111111111111111111111111111...
Ol223467v1_chr1_13406,9,1100000000000000110000000001000000000000000000...
Ol223467v1_chr1_13425,10,1100000000000000110000000001000000000000000000...
Ol223467v1_chr1_13479,12,1111111111111111111111111111111111111111110011...
Ol223467v1_chr1_13523,14,0000110000000001000011110000111101000000000000...


In [16]:
# Move the marker-id row labels into a regular column (pandas names it
# 'index') and replace them with a default 0..n-1 integer index.
final_clean_df1 = final_clean_df.reset_index()

In [17]:
final_clean_df1.columns 

Index(['index', 'Markers', 'Haplotypes'], dtype='object')

In [18]:
final_clean_df1.head()

Unnamed: 0,index,Markers,Haplotypes
0,Ol223467v1_chr1_13314,4,1111111111111111111111111111111111111111111111...
1,Ol223467v1_chr1_13406,9,1100000000000000110000000001000000000000000000...
2,Ol223467v1_chr1_13425,10,1100000000000000110000000001000000000000000000...
3,Ol223467v1_chr1_13479,12,1111111111111111111111111111111111111111110011...
4,Ol223467v1_chr1_13523,14,0000110000000001000011110000111101000000000000...


In [19]:
# give the column produced by reset_index a descriptive header
marker_header = {'index': 'Marker_ids'}
final_clean_df1 = final_clean_df1.rename(columns=marker_header)
final_clean_df1.head()

Unnamed: 0,Marker_ids,Markers,Haplotypes
0,Ol223467v1_chr1_13314,4,1111111111111111111111111111111111111111111111...
1,Ol223467v1_chr1_13406,9,1100000000000000110000000001000000000000000000...
2,Ol223467v1_chr1_13425,10,1100000000000000110000000001000000000000000000...
3,Ol223467v1_chr1_13479,12,1111111111111111111111111111111111111111110011...
4,Ol223467v1_chr1_13523,14,0000110000000001000011110000111101000000000000...


In [20]:
# write the table without the integer row index, keeping the header row
final_clean_df1.to_csv('mikk_clean_hapvarMaps.csv', index=False, header=True)

In [None]:
## Just use the chunk method 
# Read the genotype CSV in chunks, keep only the rows whose marker id is in
# final_clean_df's index, and assemble the kept rows into one dataframe.
file_path = './mikk_clean_geno_marked2.csv' 
chunk_size = 50000 # Adjust this according to your system's memory capacity

# marker ids that survived the filtering above
informative_ids = final_clean_df.index

# Initialize an empty list to store processed chunks
processed_chunks = []

# Read the file in chunks
for chunk in pd.read_csv(file_path, chunksize=chunk_size):

    # fill empty spaces with NA
    chunk = chunk.fillna('NA')

    # rename the marker header
    # (rename returns a new frame; the result must be assigned back,
    # otherwise the column keeps its 'Unnamed: 0' name)
    chunk = chunk.rename(columns={'Unnamed: 0': 'Marker_Ids'})

    # filter non informative genotypes
    # (boolean indexing also returns a new frame; without the assignment
    # every row of every chunk was kept)
    chunk = chunk[chunk['Marker_Ids'].isin(informative_ids)]

    # The former bare `chunk.reset_index()` call was dropped: its result was
    # discarded, so it never had any effect.

    # append the processed chunks into a list
    processed_chunks.append(chunk)

# save the list as a dataframe
final_processed_df = pd.concat(processed_chunks)

print(final_processed_df.shape)
print(final_processed_df.head())

# #save it into a csv file 
# (to_csv takes a lowercase `header` keyword; `Header` raises a TypeError)
# final_processed_df.to_csv('mikk_clean_geno_hapVariants.csv', index = None, header = True)


In [11]:
## Preview the first chunk of the genotype file
# Define the file path and the number of rows to preview from
file_path = './mikk_clean_geno_marked2.csv' 
chunk_size = 50000 # Adjust this according to your system's memory capacity

# Only the first `chunk_size` rows are needed here, so read exactly those
# with `nrows` rather than opening a chunked reader and abandoning it with
# `break` after the first iteration.
chunk = pd.read_csv(file_path, nrows=chunk_size)

# fill empty spaces with NA
chunk = chunk.fillna('NA')

# rename the marker header
# (the former bare `chunk.reset_index()` was dropped: its result was
# discarded, so it had no effect)
chunk = chunk.rename(columns={'Unnamed: 0': 'Marker_Ids'})

print(chunk.head())

              Marker_Ids MIKK4_1 MIKK4_2 MIKK5_1 MIKK7_2 MIKK8_2 MIKK10_1  \
0  Ol223467v1_chr1_12989     1.0     1.0     1.0      NA      NA      1.0   
1  Ol223467v1_chr1_13217     1.0     1.0     1.0     1.0      NA       NA   
2  Ol223467v1_chr1_13234     1.0     1.0     1.0     1.0      NA       NA   
3  Ol223467v1_chr1_13314      NA     1.0      NA      NA      NA      1.0   
4  Ol223467v1_chr1_13359     1.0     1.0     1.0     1.0      NA      1.0   

  MIKK11_1 MIKK11_2 MIKK13_2  ... MIKK132_4_1 MIKK132_5 MIKK134_1 MIKK135_1  \
0      1.0       NA      1.0  ...          NA        NA       1.0       1.0   
1       NA      1.0      1.0  ...         1.0       1.0        NA        NA   
2       NA      1.0      1.0  ...         1.0       1.0        NA        NA   
3      1.0      1.0      1.0  ...         1.0       0.0        NA        NA   
4      1.0      1.0      1.0  ...         1.0       1.0       1.0       1.0   

  MIKK135_2 MIKK137_4 MIKK138_1 MIKK139_4 MIKK140_1 MIKK140_3 

In [None]:
## Just use the chunk method 
# Read the genotype CSV in chunks, keep only the rows whose marker id is in
# final_clean_df's index, and assemble the kept rows into one dataframe.
file_path = './mikk_clean_geno_marked2.csv' 
chunk_size = 50000 # Adjust this according to your system's memory capacity

# marker ids that survived the filtering above
informative_ids = final_clean_df.index

# Initialize an empty list to store processed chunks
processed_chunks = []

# Read the file in chunks
for chunk in pd.read_csv(file_path, chunksize=chunk_size):

    # fill empty spaces with NA (assigned back instead of mutating in place)
    chunk = chunk.fillna('NA')

    # rename the marker header
    # (the former bare `chunk.reset_index()` was dropped: its result was
    # discarded, so it had no effect)
    chunk = chunk.rename(columns={'Unnamed: 0': 'Marker_Ids'})

    # filter non informative genotypes
    chunk = chunk[chunk['Marker_Ids'].isin(informative_ids)]

    # append the processed chunks into a list
    processed_chunks.append(chunk)

# save the list as a dataframe
final_processed_df = pd.concat(processed_chunks)

print(final_processed_df.shape)
print(final_processed_df.head())