### Count number of lines

In [1]:
# Shell escape: count the lines in the raw TSV that the notebook loads further down.
! wc -l "./new_data_from_peter/graphic-idea-2.tsv"

 7999317 ./new_data_from_peter/graphic-idea-2.tsv


### Import Pandas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the fully qualified option key. The bare "max_columns" pattern is ambiguous on
# newer pandas releases (it also matches "styler.render.max_columns") and raises OptionError.
pd.set_option("display.max_columns", 50)
%matplotlib inline

### Load the TSV into a DataFrame

In [3]:
# The TSV has no header row, so the column names are supplied here, in file order.
cols = [
    "taxId",
    "geneId",
    "year",
    "symbol",
    "geneName",
    "geneType",
    "citations",
]

# NOTE(review): default NA parsing converts literal strings such as "NA" to NaN;
# confirm that no gene symbol in the file is affected.
geneDf = pd.read_csv(
    "./new_data_from_peter/graphic-idea-2.tsv",
    sep="\t",
    header=None,
    names=cols,
)

geneDf.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
0,287,20712569,2011,D703_p4014,hypothetical protein,protein-coding,1
1,359,874703,2001,riorf119,riorf119,protein-coding,1
2,394,7789576,2009,NGR_b00360,hypothetical protein,protein-coding,1
3,562,20466744,2015,orf00008,hypothetical protein,protein-coding,1
4,573,15230191,2015,D647_p21159,adenine-specific methyltransferase,protein-coding,1


### Examine the Genes

#### Gene: CD4

In [5]:
# Every row (any taxId) whose symbol is exactly "CD4".
cd4 = geneDf.loc[geneDf["symbol"].eq("CD4")]
cd4.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
2455,9796,100052502,2015,CD4,CD4 molecule,protein-coding,1
158089,225400,102760024,2012,CD4,CD4 molecule,protein-coding,1
165357,9031,395362,1999,CD4,CD4 molecule,protein-coding,1
187993,9606,920,1987,CD4,CD4 molecule,protein-coding,22
187994,9606,920,2001,CD4,CD4 molecule,protein-coding,46


In [14]:
# Four-line summary of the CD4 subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(cd4["citations"].sum()),
    "Number of entries: {}".format(cd4["citations"].count()),
    "First year: {}".format(cd4["year"].min()),
    "Last year: {}".format(cd4["year"].max()),
]))

Total number of citations: 2001
Number of entries: 81
First year: 1984
Last year: 2017


In [74]:
# Group the CD4 rows by year, carrying only the "year" and "citations" columns.
cd4_by_year = cd4[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(cd4_by_year.count().iloc[10:15]))
print("Number of citations: " + str(cd4_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
1994          2
1995          1
1996          1
1997          3
1998          3
Number of citations:       citations
year           
1994         41
1995         50
1996         81
1997         39
1998         48


In [75]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
cd4_sum = (
    cd4_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "cd4"})
)
cd4_sum.head()

Unnamed: 0,year,cd4
0,1984,1
1,1985,5
2,1986,12
3,1987,22
4,1988,18


#### Gene: TP53

In [15]:
# Every row (any taxId) whose symbol is exactly "TP53".
tp53 = geneDf.loc[geneDf["symbol"].eq("TP53")]
tp53.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
89648,9606,7157,1984,TP53,tumor protein p53,protein-coding,1
89649,9606,7157,2002,TP53,tumor protein p53,protein-coding,338
270182,9615,403869,1997,TP53,tumor protein p53,protein-coding,1
270183,9615,403869,2015,TP53,tumor protein p53,protein-coding,3
368531,9606,7157,2003,TP53,tumor protein p53,protein-coding,340


In [16]:
# Four-line summary of the TP53 subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(tp53["citations"].sum()),
    "Number of entries: {}".format(tp53["citations"].count()),
    "First year: {}".format(tp53["year"].min()),
    "Last year: {}".format(tp53["year"].max()),
]))

Total number of citations: 8572
Number of entries: 107
First year: 1976
Last year: 2017


In [76]:
# Group the TP53 rows by year, carrying only the "year" and "citations" columns.
tp53_by_year = tp53[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(tp53_by_year.count().iloc[10:15]))
print("Number of citations: " + str(tp53_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
1993          3
1994          4
1995          4
1996          4
1997          3
Number of citations:       citations
year           
1993         19
1994         25
1995         19
1996         28
1997         31


In [77]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
tp53_sum = (
    tp53_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "tp53"})
)
tp53_sum.head()

Unnamed: 0,year,tp53
0,1976,1
1,1983,1
2,1984,1
3,1985,3
4,1986,4


#### Gene: GRB2

In [21]:
# Every row (any taxId) whose symbol is exactly "GRB2".
grb2 = geneDf.loc[geneDf["symbol"].eq("GRB2")]
grb2.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
44667,9606,2885,1994,GRB2,growth factor receptor bound protein 2,protein-coding,40
44668,9606,2885,2008,GRB2,growth factor receptor bound protein 2,protein-coding,20
323082,9606,2885,1995,GRB2,growth factor receptor bound protein 2,protein-coding,38
601968,9606,2885,1992,GRB2,growth factor receptor bound protein 2,protein-coding,2
686527,9823,100192436,2005,GRB2,growth factor receptor bound protein 2,protein-coding,1


In [22]:
# Four-line summary of the GRB2 subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(grb2["citations"].sum()),
    "Number of entries: {}".format(grb2["citations"].count()),
    "First year: {}".format(grb2["year"].min()),
    "Last year: {}".format(grb2["year"].max()),
]))

Total number of citations: 772
Number of entries: 52
First year: 1977
Last year: 2017


In [78]:
# Group the GRB2 rows by year, carrying only the "year" and "citations" columns.
grb2_by_year = grb2[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(grb2_by_year.count().iloc[10:15]))
print("Number of citations: " + str(grb2_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
2001          1
2002          2
2003          2
2004          2
2005          2
Number of citations:       citations
year           
2001         50
2002         47
2003         24
2004         26
2005         26


In [79]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
grb2_sum = (
    grb2_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "grb2"})
)
grb2_sum.head()

Unnamed: 0,year,grb2
0,1977,1
1,1992,2
2,1993,8
3,1994,41
4,1995,38


#### Gene: HBB

In [17]:
# Every row (any taxId) whose symbol is exactly "HBB".
hbb = geneDf.loc[geneDf["symbol"].eq("HBB")]
hbb.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
24127,9541,101926697,2011,HBB,"hemoglobin, beta",protein-coding,1
52181,9598,450978,1985,HBB,hemoglobin subunit beta,protein-coding,1
111260,9606,3043,2009,HBB,hemoglobin subunit beta,protein-coding,48
126385,9823,407066,2003,HBB,"hemoglobin, beta",protein-coding,1
297213,9606,3043,1979,HBB,hemoglobin subunit beta,protein-coding,2


In [18]:
# Four-line summary of the HBB subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(hbb["citations"].sum()),
    "Number of entries: {}".format(hbb["citations"].count()),
    "First year: {}".format(hbb["year"].min()),
    "Last year: {}".format(hbb["year"].max()),
]))

Total number of citations: 758
Number of entries: 89
First year: 1961
Last year: 2017


In [80]:
# Group the HBB rows by year, carrying only the "year" and "citations" columns.
hbb_by_year = hbb[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(hbb_by_year.count().iloc[10:15]))
print("Number of citations: " + str(hbb_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
1977          2
1978          2
1979          1
1980          2
1981          3
Number of citations:       citations
year           
1977          7
1978          5
1979          2
1980          8
1981         13


In [81]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
hbb_sum = (
    hbb_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "hbb"})
)
hbb_sum.head()

Unnamed: 0,year,hbb
0,1961,1
1,1965,1
2,1967,1
3,1968,3
4,1970,1


#### Gene: TNF

In [23]:
# Every row (any taxId) whose symbol is exactly "TNF".
# NOTE(review): matching on symbol alone pools differently named genes from other
# taxa that share the symbol; confirm this is intended.
tnf = geneDf.loc[geneDf["symbol"].eq("TNF")]
tnf.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
93352,419612,102510424,2012,TNF,tumor necrosis factor,protein-coding,1
103814,9606,7124,2000,TNF,tumor necrosis factor,protein-coding,30
118068,296587,8248308,2009,TNF,receptor-like cell wall protein,protein-coding,1
152303,9823,397086,1989,TNF,tumor necrosis factor,protein-coding,1
325292,9544,715467,2011,TNF,tumor necrosis factor,protein-coding,1


In [24]:
# Four-line summary of the TNF subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(tnf["citations"].sum()),
    "Number of entries: {}".format(tnf["citations"].count()),
    "First year: {}".format(tnf["year"].min()),
    "Last year: {}".format(tnf["year"].max()),
]))

Total number of citations: 5499
Number of entries: 140
First year: 1984
Last year: 2017


In [82]:
# Group the TNF rows by year, carrying only the "year" and "citations" columns.
tnf_by_year = tnf[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(tnf_by_year.count().iloc[10:15]))
print("Number of citations: " + str(tnf_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
1994          2
1995          4
1996          1
1997          2
1998          1
Number of citations:       citations
year           
1994          9
1995         18
1996          7
1997         15
1998         13


In [83]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
tnf_sum = (
    tnf_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "tnf"})
)
tnf_sum.head()

Unnamed: 0,year,tnf
0,1984,1
1,1985,4
2,1986,5
3,1987,1
4,1988,4


#### Gene: APOE

In [26]:
# Every row (any taxId) whose symbol is exactly "APOE".
apoe = geneDf.loc[geneDf["symbol"].eq("APOE")]
apoe.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
3623,9913,281004,2002,APOE,apolipoprotein E,protein-coding,2
17679,9606,348,2009,APOE,apolipoprotein E,protein-coding,351
282088,9913,281004,2003,APOE,apolipoprotein E,protein-coding,2
296180,9606,348,1998,APOE,apolipoprotein E,protein-coding,6
296181,9606,348,2008,APOE,apolipoprotein E,protein-coding,327


In [27]:
# Four-line summary of the APOE subset, emitted with a single print call.
print("\n".join([
    "Total number of citations: {}".format(apoe["citations"].sum()),
    "Number of entries: {}".format(apoe["citations"].count()),
    "First year: {}".format(apoe["year"].min()),
    "Last year: {}".format(apoe["year"].max()),
]))

Total number of citations: 4021
Number of entries: 79
First year: 1973
Last year: 2017


In [84]:
# Group the APOE rows by year, carrying only the "year" and "citations" columns.
apoe_by_year = apoe[["year", "citations"]].groupby("year")
# Peek at positions 10-14 of the per-year row counts and citation totals.
print("Number of entries: " + str(apoe_by_year.count().iloc[10:15]))
print("Number of citations: " + str(apoe_by_year.sum().iloc[10:15]))

Number of entries:       citations
year           
1987          2
1988          1
1989          3
1990          2
1991          2
Number of citations:       citations
year           
1987          2
1988          3
1989          9
1990          4
1991          5


In [85]:
# Per-year citation totals as a flat frame; the value column is named after the gene.
apoe_sum = (
    apoe_by_year.sum()
    .reset_index()
    .rename(columns={"citations": "apoe"})
)
apoe_sum.head()

Unnamed: 0,year,apoe
0,1973,1
1,1978,2
2,1979,1
3,1980,1
4,1981,1


### Join the summed data frames for CD4, TP53, GRB2, HBB, TNF and APOE on year

In [107]:
# Outer-join the six per-gene yearly sums on "year"; a year missing for a gene
# yields NaN in that gene's column.
joined_all = (
    cd4_sum
    .merge(tp53_sum, on="year", how="outer")
    .merge(grb2_sum, on="year", how="outer")
    .merge(hbb_sum, on="year", how="outer")
    .merge(tnf_sum, on="year", how="outer")
    .merge(apoe_sum, on="year", how="outer")
)
joined_all.tail()

Unnamed: 0,year,cd4,tp53,grb2,hbb,tnf,apoe
46,1978,,,,5,,2.0
47,1979,,,,2,,1.0
48,1980,,,,8,,1.0
49,1981,,,,13,,1.0
50,1982,,,,9,,4.0


### Print out a csv

In [108]:
# Write the joined per-gene table (row index included) next to the notebook.
joined_all.to_csv(path_or_buf="./joined_all.csv")

### How many unique gene symbols do we have?

In [30]:
# Distinct values in the symbol column; dropna=False keeps a missing value in the
# count, as unique() does.
geneDf["symbol"].nunique(dropna=False)

5366161

### How many unique human gene symbols do we have?

In [42]:
# Same count, restricted to rows with taxId 9606 (the human rows, per the heading).
geneDf.loc[geneDf["taxId"].eq(9606), "symbol"].nunique(dropna=False)

35717

### Create a function to handle summing the citations by year

In [129]:
def sumByYear(df_in, geneSymbol):
    """Build the per-year citation totals for one gene symbol.

    Rows of ``df_in`` whose "symbol" column equals ``geneSymbol`` are selected,
    a short summary of that subset (citation total, row count, first and last
    year) is printed, and the subset's citations are summed per year.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame with at least the columns "symbol", "year" and "citations".
    geneSymbol : str
        Symbol to match; also used as the name of the summed column.

    Returns
    -------
    pandas.DataFrame
        One row per year, with the columns "year" and ``geneSymbol``.
    """
    matches = df_in.loc[df_in["symbol"] == geneSymbol]

    print("\n")
    citations = matches["citations"]
    years = matches["year"]
    summary_lines = [
        " total number of citations: " + str(citations.sum()),
        " number of entries: " + str(citations.count()),
        " first year: " + str(years.min()),
        " last year: " + str(years.max()),
    ]
    for summary_line in summary_lines:
        print(geneSymbol + summary_line)
    print("\n")

    # Only "year" and "citations" take part in the aggregation; the summed column
    # is renamed so several genes can later be merged on "year".
    return (
        matches.loc[:, ["year", "citations"]]
        .groupby("year")
        .sum()
        .reset_index()
        .rename(columns={"citations": geneSymbol})
    )

### Sum each gene for all species as well as just human

In [150]:
# Per-year sums across all species, one frame per gene, computed in the listed order.
(
    cd4_summed,
    tp53_summed,
    grb2_summed,
    hbb_summed,
    tnf_summed,
    apoe_summed,
) = [
    sumByYear(geneDf, symbol)
    for symbol in ("CD4", "TP53", "GRB2", "HBB", "TNF", "APOE")
]



CD4 total number of citations: 2001
CD4 number of entries: 81
CD4 first year: 1984
CD4 last year: 2017




TP53 total number of citations: 8572
TP53 number of entries: 107
TP53 first year: 1976
TP53 last year: 2017




GRB2 total number of citations: 772
GRB2 number of entries: 52
GRB2 first year: 1977
GRB2 last year: 2017




HBB total number of citations: 758
HBB number of entries: 89
HBB first year: 1961
HBB last year: 2017




TNF total number of citations: 5499
TNF number of entries: 140
TNF first year: 1984
TNF last year: 2017




APOE total number of citations: 4021
APOE number of entries: 79
APOE first year: 1973
APOE last year: 2017




In [151]:
# Filter to taxId 9606 (the human rows, per the section heading) once, rather than
# re-scanning the whole frame for each of the six genes.
human_genes = geneDf[geneDf["taxId"] == 9606]

cd4_human_summed = sumByYear(human_genes, "CD4")
tp53_human_summed = sumByYear(human_genes, "TP53")
grb2_human_summed = sumByYear(human_genes, "GRB2")
hbb_human_summed = sumByYear(human_genes, "HBB")
tnf_human_summed = sumByYear(human_genes, "TNF")
apoe_human_summed = sumByYear(human_genes, "APOE")



CD4 total number of citations: 1953
CD4 number of entries: 34
CD4 first year: 1984
CD4 last year: 2017




TP53 total number of citations: 8479
TP53 number of entries: 35
TP53 first year: 1976
TP53 last year: 2017




GRB2 total number of citations: 747
GRB2 number of entries: 27
GRB2 first year: 1977
GRB2 last year: 2017




HBB total number of citations: 712
HBB number of entries: 46
HBB first year: 1961
HBB last year: 2017




TNF total number of citations: 5314
TNF number of entries: 34
TNF first year: 1984
TNF last year: 2017




APOE total number of citations: 3977
APOE number of entries: 40
APOE first year: 1973
APOE last year: 2017




### Get the total number of citations for each year, for all species and for human genes only

In [165]:
# Yearly citation totals over every row of the data set (all genes, all species).
all_genes_by_year = geneDf[["year", "citations"]].groupby("year")
all_genes_by_year_summed = all_genes_by_year.sum().reset_index()
all_genes_by_year_summed.tail()

Unnamed: 0,year,citations
92,2013,759035
93,2014,954697
94,2015,721942
95,2016,647288
96,2017,147027


In [166]:
# Yearly citation totals restricted to rows with taxId 9606 (human, per the heading).
human_genes_by_year = (
    geneDf.loc[geneDf["taxId"].eq(9606), ["year", "citations"]]
    .groupby("year")
)
human_genes_by_year_summed = human_genes_by_year.sum().reset_index()
human_genes_by_year_summed.tail()

Unnamed: 0,year,citations
64,2013,81353
65,2014,84919
66,2015,93137
67,2016,51633
68,2017,29441


### Join the total and gene specific sums together

In [169]:
# Outer-join the all-species per-gene yearly sums on "year", then append the
# all-genes yearly total ("citations"). Each merge builds on the previous result;
# the original restarted from `joined_all` on every line, so the `*_summed`
# frames were never actually combined.
summed_all = (
    cd4_summed
    .merge(tp53_summed, on="year", how="outer")
    .merge(grb2_summed, on="year", how="outer")
    .merge(hbb_summed, on="year", how="outer")
    .merge(tnf_summed, on="year", how="outer")
    .merge(apoe_summed, on="year", how="outer")
    .merge(all_genes_by_year_summed, on="year", how="outer")
    # sumByYear names each column after the symbol ("CD4", ...); lower-case them
    # so the exported header stays year,cd4,tp53,grb2,hbb,tnf,apoe,citations.
    .rename(columns=str.lower)
)
# Years without any row for a gene become 0 instead of NaN; rows ordered by year.
summed_all_fill = summed_all.fillna(0).sort_values("year")
# Display the frame that was just built (the original referenced the undefined
# name `summed_all_fill_sort`).
summed_all_fill.head()

Unnamed: 0,year,cd4,tp53,grb2,hbb,tnf,apoe,citations
51,1920,0.0,0.0,0.0,0.0,0.0,0.0,3
52,1921,0.0,0.0,0.0,0.0,0.0,0.0,7
53,1923,0.0,0.0,0.0,0.0,0.0,0.0,4
54,1924,0.0,0.0,0.0,0.0,0.0,0.0,17
55,1925,0.0,0.0,0.0,0.0,0.0,0.0,4


In [170]:
# Outer-join the human-only per-gene yearly sums on "year", then append the
# human yearly total ("citations"). Each merge builds on the previous result;
# the original restarted from `joined_all` on every line, which put the
# ALL-species gene sums next to the human totals.
summed_human = (
    cd4_human_summed
    .merge(tp53_human_summed, on="year", how="outer")
    .merge(grb2_human_summed, on="year", how="outer")
    .merge(hbb_human_summed, on="year", how="outer")
    .merge(tnf_human_summed, on="year", how="outer")
    .merge(apoe_human_summed, on="year", how="outer")
    .merge(human_genes_by_year_summed, on="year", how="outer")
    # sumByYear names each column after the symbol ("CD4", ...); lower-case them
    # so the exported header stays year,cd4,tp53,grb2,hbb,tnf,apoe,citations.
    .rename(columns=str.lower)
)
# Years without any row for a gene become 0 instead of NaN; rows ordered by year.
summed_human_fill = summed_human.fillna(0).sort_values("year")
summed_human_fill.head()

Unnamed: 0,year,cd4,tp53,grb2,hbb,tnf,apoe,citations
51,1930,0.0,0.0,0.0,0.0,0.0,0.0,3
52,1946,0.0,0.0,0.0,0.0,0.0,0.0,1
53,1948,0.0,0.0,0.0,0.0,0.0,0.0,1
54,1950,0.0,0.0,0.0,0.0,0.0,0.0,1
55,1952,0.0,0.0,0.0,0.0,0.0,0.0,1


### Export to CSV

In [171]:
# Write both filled, year-sorted tables (row index included) next to the notebook.
summed_all_fill.to_csv(path_or_buf="./summed_all_fill.csv")
summed_human_fill.to_csv(path_or_buf="./summed_human_fill.csv")