In [24]:
# Import csv as Pandas DataFrame
import pandas as pd
snps = pd.read_csv('SNPRaw.csv', header = 0)

# Print shape of snps
snps.shape

(1638, 9)

In [25]:
# Remove empty values from snps
snps = snps[snps['129S1/SvImJ'] != ' ']
snps = snps[snps['C57BL/6ByJ'] != ' ']
snps = snps.dropna()

# Print shape of snps
snps.shape

(1582, 9)

In [61]:
# Make new dataframe of shared SNPs between 129 and B6
eq = snps[snps['129S1/SvImJ'] == snps['C57BL/6ByJ']]
print(eq.shape)

# Make new dataframe of different SNPs between 129 and B6
dif = snps[snps['129S1/SvImJ'] != snps['C57BL/6ByJ']]
print(dif.shape)

(540, 9)
(1042, 9)


In [105]:
# Remove commas from strings of SNP locations
for i in range(len(dif['129S1/SvImJ'])):
    dif.iloc[i,2].replace(',','')

# Drop a pesky SNP that isn't in Build 30 like the rest of the data
dif = dif.drop(dif.index[360])

# Convert remaining locations to integers
dif['New Chr pos (bp) Build 30'] = pd.to_numeric(dif['New Chr pos (bp) Build 30'],
                                                 errors = 'coerce')

# Check the first value to make sure that worked
print(dif.iloc[0,2])
type(dif.iloc[0,2])

In [121]:
# Assign the distances between consecutive SNPs to a list then assign 
# the location of the first SNP to a list
dist = []
chrm = []
for i in range(len(dif['129S1/SvImJ'])-1):
    # This if statement makes sure only locations on the same chromosome
    # are included
    if dif.iloc[i,2] < dif.iloc[i+1,2]:
        chrm.append(dif.iloc[i,0])
        dist.append(dif.iloc[i+1,2] - dif.iloc[i,2])

In [120]:
# Turn the lists into a pandas dataframe
pdpairs = pd.DataFrame({'Location': chrm, 'Distance':dist})

# Fix the columns so that location comes first
cols = pdpairs.columns.tolist()
cols = cols[-1:] + cols[:-1]
print(cols)
pdpairs = pdpairs[cols]
pdpairs.head()

['Location', 'Distance']


Unnamed: 0,Location,Distance
0,01-005230167-M,1932787
1,01-007166135-M,937646
2,01-008110094-M,939494
3,01-009072542-M,982088
4,01-010053195-M,1955731


In [124]:
# Sort pdpairs by most distant SNPs which differ between 129 and B6
pdpairs_sorted = pdpairs.sort_values('Distance', ascending = False)
pdpairs_sorted.head()

Unnamed: 0,Location,Distance
600,10-053898997-M,37723099
997,X-008280846-M,27082198
129,02-078062303-M,23007775
1007,X-068494838-M,20876071
1017,X-123551831-M,18397467


In [125]:
dif.head

<bound method NDFrame.head of      JAX SNP ID (chromosome-position-source)       RS #  \
3                             01-005230167-M  rs3708040   
5                             01-007166135-M  rs3667401   
6                             01-008110094-M  rs3684358   
7                             01-009072542-M  rs3714728   
8                             01-010053195-M  rs3706453   
10                            01-012008926-M  rs3664960   
11                            01-013004153-M  rs3661835   
13                            01-016125198-M  rs3684370   
14                            01-021004950-M  rs3716569   
15                            01-022154835-M  rs3711079   
19                            01-026072256-M  rs3666554   
20                            01-027172163-M  rs3717497   
22                            01-029142332-M  rs3725641   
23                            01-030151053-M  rs3695988   
26                            01-032969697-M  rs3707642   
27                        

In [150]:
# Find out which chromosomes differ most between 129 and B6
equ_smpl = pd.concat([eq['JAX SNP ID (chromosome-position-source)'],eq['129S1/SvImJ'], 
                      eq['C57BL/6ByJ']], axis = 1, keys = ['Chr No', '129', 'C57B6'])
equ_smpl.head()

Unnamed: 0,Chr No,129,C57B6
1,01-004147733-M,A,A
2,01-004787554-N,T,T
4,01-006479686-N,T,T
9,01-011359031-N,G,G
16,01-023061064-M,C,C


In [152]:
exper = equ_smpl.replace(equ_smpl['Chr No'], eq)
print(equ_smpl.head())

            Chr No 129 C57B6
1   01-004147733-M   A     A
2   01-004787554-N   T     T
4   01-006479686-N   T     T
9   01-011359031-N   G     G
16  01-023061064-M   C     C


In [140]:
a = 'scowl'
print(a)
a = a[0:2]
print(a)

scowl
sc
