### Import pandas, NumPy, and Matplotlib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the fully qualified option key. set_option treats the key as a pattern,
# and the bare "max_columns" is ambiguous on newer pandas releases (it also
# matches "styler.render.max_columns"), which makes the call raise OptionError.
pd.set_option("display.max_columns", 50)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

### Load the TSV into a DataFrame

In [2]:
# Column names for the input file, listed in file order.
cols = [
    "taxId",
    "geneId",
    "year",
    "symbol",
    "geneName",
    "geneType",
    "citations",
]

# The file is read without a header row (header=None), so the column names
# are supplied explicitly.
geneDf = pd.read_table(
    "./data/graphic-idea-2.tsv",
    header=None,
    names=cols,
)

# Preview the first rows of the loaded frame.
geneDf.head()

Unnamed: 0,taxId,geneId,year,symbol,geneName,geneType,citations
0,287,20712569,2011,D703_p4014,hypothetical protein,protein-coding,1
1,359,874703,2001,riorf119,riorf119,protein-coding,1
2,394,7789576,2009,NGR_b00360,hypothetical protein,protein-coding,1
3,562,20466744,2015,orf00008,hypothetical protein,protein-coding,1
4,573,15230191,2015,D647_p21159,adenine-specific methyltransferase,protein-coding,1


### Get the number of human genes cited for each year

In [7]:
# Keep only the rows with taxId 9606 (the human rows, per the section heading)
# and the two columns needed, then group them by year.
human_genes_by_year = geneDf.loc[
    geneDf["taxId"] == 9606, ["year", "citations"]
].groupby("year")

# count() tallies the rows with a non-null "citations" value in each year.
# rename() returns a new frame, so nothing is mutated in place and the cell
# stays a single re-runnable expression.
human_genes_by_year_summed = (
    human_genes_by_year
    .count()
    .reset_index()
    .rename(columns={"citations": "number_of_human_genes_cited"})
)
human_genes_by_year_summed.tail()

Unnamed: 0,year,number_of_human_genes_cited
64,2013,13834
65,2014,14803
66,2015,15234
67,2016,11168
68,2017,13748


In [8]:
# Show the first five rows of the per-year gene counts.
human_genes_by_year_summed.head(n=5)

Unnamed: 0,year,number_of_human_genes_cited
0,1930,3
1,1946,1
2,1948,1
3,1950,1
4,1952,1


### Get the total number of citations for human genes each year

In [9]:
# Keep only the rows with taxId 9606 (the human rows, per the section heading)
# and the two columns needed, then group them by year.
human_cites_by_year = geneDf.loc[
    geneDf["taxId"] == 9606, ["year", "citations"]
].groupby("year")

# sum() adds up the "citations" values within each year.
# rename() returns a new frame, so nothing is mutated in place and the cell
# stays a single re-runnable expression.
human_cites_by_year_summed = (
    human_cites_by_year
    .sum()
    .reset_index()
    .rename(columns={"citations": "number_of_human_gene_citations"})
)
human_cites_by_year_summed.tail()

Unnamed: 0,year,number_of_human_gene_citations
64,2013,81353
65,2014,84919
66,2015,93137
67,2016,51633
68,2017,29441


In [10]:
# Show the first five rows of the per-year citation totals.
human_cites_by_year_summed.head(n=5)

Unnamed: 0,year,number_of_human_gene_citations
0,1930,3
1,1946,1
2,1948,1
3,1950,1
4,1952,1


### Join the two DataFrames

In [12]:
# Outer-join the per-year citation totals (left) with the per-year gene
# counts (right) on the shared "year" column.
human_gene_citatons_over_time = pd.merge(
    human_cites_by_year_summed,
    human_genes_by_year_summed,
    how="outer",
    on="year",
)
human_gene_citatons_over_time.head(n=5)

Unnamed: 0,year,number_of_human_gene_citations,number_of_human_genes_cited
0,1930,3,3
1,1946,1,1
2,1948,1,1
3,1950,1,1
4,1952,1,1


In [13]:
# Show the last five rows of the joined table.
human_gene_citatons_over_time.tail(n=5)

Unnamed: 0,year,number_of_human_gene_citations,number_of_human_genes_cited
64,2013,81353,13834
65,2014,84919,14803
66,2015,93137,15234
67,2016,51633,11168
68,2017,29441,13748


### Write the joined DataFrame to a CSV file

In [14]:
# Save the joined table to a CSV file in the working directory.
# index=True is the pandas default, spelled out here: the row index is
# written as the first, unnamed column of the file.
human_gene_citatons_over_time.to_csv(
    "./human_gene_citations_over_time.csv",
    index=True,
)