### Count number of lines

In [1]:
! wc -l "./data/graphic-idea-2.tsv"

 7999317 ./data/graphic-idea-2.tsv


### Import Pandas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("max_columns", 50)
%matplotlib inline

### Load the csv into a DataFrame

In [3]:
cols = ["taxId", "geneId", "year", "symbol", "geneName", "geneType", "citations"]

geneDf = pd.read_table("./data/graphic-idea-2.tsv", header=None, names=cols)

geneDf.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
0,287,20712569,2011,D703_p4014,hypothetical protein,protein-coding,1
1,359,874703,2001,riorf119,riorf119,protein-coding,1
2,394,7789576,2009,NGR_b00360,hypothetical protein,protein-coding,1
3,562,20466744,2015,orf00008,hypothetical protein,protein-coding,1
4,573,15230191,2015,D647_p21159,adenine-specific methyltransferase,protein-coding,1


### Examine the Genes

How many unique genes symbols do we have?

In [4]:
geneDf["symbol"].unique().size

5366161

How many unique human gene symbols do we have?

In [5]:
geneDf[geneDf["taxId"] == 9606]["symbol"].unique().size

35717

### Create a function to handle summing the citations by year

In [6]:
def sumByYear(df_in, geneSymbol):
    """Takes a data frame of lots of genes, selects our chosen gene by geneSymbol
    groups by year, sums the citations and the returns a new data frame
    """
    df_gene = df_in[df_in["symbol"] == geneSymbol]
    
    print("\n")
    print(geneSymbol + " total number of citations: " + str(df_gene["citations"].sum()))
    print(geneSymbol + " number of entries: " + str(df_gene["citations"].count()))
    print(geneSymbol + " first year: " + str(df_gene["year"].min()))
    print(geneSymbol + " last year: " + str(df_gene["year"].max()))
    print("\n")
    
    df_gene_by_year = df_gene.filter(["year","citations"], axis=1).groupby("year")
    
    df_gene_sum = df_gene_by_year.sum().reset_index()
    df_gene_sum.rename(columns={"citations":geneSymbol}, inplace=True)
    
    return df_gene_sum

### Sum each gene for all species

In [7]:
cd4_summed = sumByYear(geneDf, "CD4")
tp53_summed = sumByYear(geneDf, "TP53")
grb2_summed = sumByYear(geneDf, "GRB2")
hbb_summed = sumByYear(geneDf, "HBB")
tnf_summed = sumByYear(geneDf, "TNF")
apoe_summed = sumByYear(geneDf, "APOE")



CD4 total number of citations: 2001
CD4 number of entries: 81
CD4 first year: 1984
CD4 last year: 2017




TP53 total number of citations: 8572
TP53 number of entries: 107
TP53 first year: 1976
TP53 last year: 2017




GRB2 total number of citations: 772
GRB2 number of entries: 52
GRB2 first year: 1977
GRB2 last year: 2017




HBB total number of citations: 758
HBB number of entries: 89
HBB first year: 1961
HBB last year: 2017




TNF total number of citations: 5499
TNF number of entries: 140
TNF first year: 1984
TNF last year: 2017




APOE total number of citations: 4021
APOE number of entries: 79
APOE first year: 1973
APOE last year: 2017




### Sum each gene for just human genes

In [8]:
cd4_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "CD4")
tp53_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "TP53")
grb2_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "GRB2")
hbb_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "HBB")
tnf_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "TNF")
apoe_human_summed = sumByYear(geneDf[geneDf["taxId"] == 9606], "APOE")



CD4 total number of citations: 1953
CD4 number of entries: 34
CD4 first year: 1984
CD4 last year: 2017




TP53 total number of citations: 8479
TP53 number of entries: 35
TP53 first year: 1976
TP53 last year: 2017




GRB2 total number of citations: 747
GRB2 number of entries: 27
GRB2 first year: 1977
GRB2 last year: 2017




HBB total number of citations: 712
HBB number of entries: 46
HBB first year: 1961
HBB last year: 2017




TNF total number of citations: 5314
TNF number of entries: 34
TNF first year: 1984
TNF last year: 2017




APOE total number of citations: 3977
APOE number of entries: 40
APOE first year: 1973
APOE last year: 2017




### Get the total amount of citations for each year for all species and just human genes

In [9]:
all_genes_by_year = geneDf.filter(["year","citations"], axis=1).groupby("year")
all_genes_by_year_summed = all_genes_by_year.sum().reset_index()
all_genes_by_year_summed.tail()

Unnamed: 0,year,citations
92,2013,759035
93,2014,954697
94,2015,721942
95,2016,647288
96,2017,147027


In [10]:
human_genes_by_year = geneDf[geneDf["taxId"] == 9606].filter(["year","citations"], axis=1).groupby("year")
human_genes_by_year_summed = human_genes_by_year.sum().reset_index()
human_genes_by_year_summed.tail()

Unnamed: 0,year,citations
64,2013,81353
65,2014,84919
66,2015,93137
67,2016,51633
68,2017,29441


### Join the total and gene specific sums together

In [11]:
summed_all = pd.merge(cd4_summed, tp53_summed, on="year", how="outer")
summed_all = summed_all.merge(grb2_summed, on="year", how="outer")
summed_all = summed_all.merge(hbb_summed, on="year", how="outer")
summed_all = summed_all.merge(tnf_summed, on="year", how="outer")
summed_all = summed_all.merge(apoe_summed, on="year", how="outer")
summed_all = summed_all.merge(all_genes_by_year_summed, on="year", how="outer")
summed_all_fill = summed_all.fillna(0).sort_values("year")
summed_all_fill.head()

Unnamed: 0,year,CD4,TP53,GRB2,HBB,TNF,APOE,citations
51,1920,0.0,0.0,0.0,0.0,0.0,0.0,3
52,1921,0.0,0.0,0.0,0.0,0.0,0.0,7
53,1923,0.0,0.0,0.0,0.0,0.0,0.0,4
54,1924,0.0,0.0,0.0,0.0,0.0,0.0,17
55,1925,0.0,0.0,0.0,0.0,0.0,0.0,4


In [12]:
summed_human = pd.merge(cd4_human_summed, tp53_human_summed, on="year", how="outer")
summed_human = summed_all.merge(grb2_human_summed, on="year", how="outer")
summed_human = summed_all.merge(hbb_human_summed, on="year", how="outer")
summed_human = summed_all.merge(tnf_human_summed, on="year", how="outer")
summed_human = summed_all.merge(apoe_human_summed, on="year", how="outer")
summed_human = summed_all.merge(human_genes_by_year_summed, on="year", how="outer")
summed_human_fill = summed_human.fillna(0).sort_values("year")
summed_human_fill.head()

Unnamed: 0,year,CD4,TP53,GRB2,HBB,TNF,APOE,citations_x,citations_y
51,1920,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0
52,1921,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0
53,1923,0.0,0.0,0.0,0.0,0.0,0.0,4,0.0
54,1924,0.0,0.0,0.0,0.0,0.0,0.0,17,0.0
55,1925,0.0,0.0,0.0,0.0,0.0,0.0,4,0.0


### Export to CSV

In [13]:
summed_all_fill.to_csv("./summed_all_fill.csv")
summed_human_fill.to_csv("./summed_human_fill.csv")

### Which are the most cited genes?

In [14]:
all_genes = geneDf.filter(["symbol","citations"], axis=1).groupby("symbol")
all_genes_summed = all_genes.sum().reset_index().sort_values("citations").reset_index().drop("index", 1)
all_genes_summed.tail(20)

Unnamed: 0,symbol,citations
5366140,APOE,4021
5366141,IL6,4033
5366142,VEGFA,4241
5366143,ATP8,4545
5366144,ND4L,4725
5366145,EGFR,4756
5366146,ND6,4779
5366147,ND3,4785
5366148,COX3,4808
5366149,ND4,4817


### Export a CSV of the top 50 genes

In [15]:
all_genes_summed.tail(50).to_csv("./all_genes_summed_50.csv")

### Total number of citations

In [16]:
"{:,}".format(all_genes_summed["citations"].sum())

'10,693,687'

### Which are the most cited human genes?

In [17]:
all_genes_human = geneDf[geneDf["taxId"] == 9606].filter(["symbol","citations"], axis=1).groupby("symbol")
all_genes_human_summed = all_genes_human.sum().reset_index().sort_values("citations").reset_index().drop("index", 1)
all_genes_human_summed.tail(20)

Unnamed: 0,symbol,citations
35697,LOC110806262,2408
35698,ACE,2416
35699,ERBB2,2448
35700,IL1B,2461
35701,MMP9,2482
35702,BRCA1,2530
35703,HLA-DRB1,2561
35704,HIF1A,2589
35705,IL10,2715
35706,NFKB1,2757


### Export a CSV of the top 50 genes

In [25]:
all_genes_human_summed.tail(50).to_csv("./all_genes_human_summed_50.csv")

### Export a CSV of the top 100 genes

In [26]:
all_genes_human_summed.tail(100).to_csv("./all_genes_human_summed_100.csv")

### Total number of citations

In [19]:
"{:,}".format(all_genes_human_summed["citations"].sum())

'1,233,229'

### Which are the most cited Species?

In [20]:
all_species = geneDf.filter(["taxId","citations"], axis=1).groupby("taxId")
all_species_summed = all_species.sum().reset_index().sort_values("citations").reset_index().drop("index", 1)
all_species_summed.tail(20)

Unnamed: 0,taxId,citations
13662,280463,38544
13663,353153,39307
13664,3694,41575
13665,49451,42275
13666,46245,49091
13667,6239,53273
13668,208964,57965
13669,9913,59264
13670,412133,59512
13671,4577,64779


### Total number of citations by species 

In [21]:
"{:,}".format(all_species_summed["citations"].sum())

'10,693,813'

### Total number of citations in whole document

In [27]:
"{:,}".format(geneDf["citations"].sum())

'10,693,813'