Skip to content

Commit

Permalink
Retain all code red & blue mutations
Browse files Browse the repository at this point in the history
Addresses cognoma#2 -- add additional mutation effects. Added all
red & blue mutations from http://xena.ucsc.edu/how-we-characterize-mutations/
that were present in the data.
  • Loading branch information
dhimmel committed Jul 15, 2016
1 parent 0239cba commit ffe66ab
Show file tree
Hide file tree
Showing 7 changed files with 1,376 additions and 1,328 deletions.
124 changes: 78 additions & 46 deletions 2.TCGA-process.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert SNP mutations to gene mutations"
"### Convert SNP mutations to gene mutations\n",
"\n",
"The next cell specifies which mutations to preserve as gene-affecting, which were chosen according to the red & blue [mutation effects in Xena](http://xena.ucsc.edu/how-we-characterize-mutations/)."
]
},
{
Expand All @@ -386,8 +388,16 @@
"outputs": [],
"source": [
"mutations = {\n",
" 'Frame_Shift_Del',\n",
" 'Frame_Shift_Ins',\n",
" 'In_Frame_Del',\n",
" 'In_Frame_Ins',\n",
" 'Missense_Mutation',\n",
" 'Nonsense_Mutation',\n",
" 'Nonstop_Mutation',\n",
" 'RNA',\n",
" 'Splice_Site',\n",
" 'Translation_Start_Site',\n",
"}"
]
},
Expand All @@ -397,6 +407,29 @@
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{\"3'UTR\", \"5'Flank\", \"5'UTR\", 'IGR', 'Intron', 'Silent'}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mutations effects that were observed but nut included\n",
"set(snp_mutation_df.effect.unique()) - mutations"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -434,7 +467,7 @@
"1 TCGA-02-0003-01 ANAPC4 1"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -452,18 +485,18 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(8499, 22256)"
"(8508, 30236)"
]
},
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -479,18 +512,18 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0.57% sample-gene pairs are mutated'"
"'0.50% sample-gene pairs are mutated'"
]
},
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -502,7 +535,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {
"collapsed": false
},
Expand All @@ -523,42 +556,42 @@
" <tr>\n",
" <th>0</th>\n",
" <td>TP53</td>\n",
" <td>2390</td>\n",
" <td>2992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TTN</td>\n",
" <td>2378</td>\n",
" <td>2465</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MUC16</td>\n",
" <td>1468</td>\n",
" <td>1518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PIK3CA</td>\n",
" <td>1000</td>\n",
" <td>1024</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CSMD3</td>\n",
" <td>925</td>\n",
" <td>989</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index gene\n",
"0 TP53 2390\n",
"1 TTN 2378\n",
"2 MUC16 1468\n",
"3 PIK3CA 1000\n",
"4 CSMD3 925"
"0 TP53 2992\n",
"1 TTN 2465\n",
"2 MUC16 1518\n",
"3 PIK3CA 1024\n",
"4 CSMD3 989"
]
},
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -570,7 +603,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {
"collapsed": false
},
Expand All @@ -591,42 +624,42 @@
" <tr>\n",
" <th>0</th>\n",
" <td>TCGA-IB-7651-01</td>\n",
" <td>8047</td>\n",
" <td>8369</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCGA-FW-A3R5-06</td>\n",
" <td>7244</td>\n",
" <td>7772</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TCGA-AP-A0LM-01</td>\n",
" <td>7024</td>\n",
" <td>7227</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TCGA-AP-A059-01</td>\n",
" <td>6426</td>\n",
" <td>6660</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TCGA-B5-A0JY-01</td>\n",
" <td>6113</td>\n",
" <td>6338</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" index sample_id\n",
"0 TCGA-IB-7651-01 8047\n",
"1 TCGA-FW-A3R5-06 7244\n",
"2 TCGA-AP-A0LM-01 7024\n",
"3 TCGA-AP-A059-01 6426\n",
"4 TCGA-B5-A0JY-01 6113"
"0 TCGA-IB-7651-01 8369\n",
"1 TCGA-FW-A3R5-06 7772\n",
"2 TCGA-AP-A0LM-01 7227\n",
"3 TCGA-AP-A059-01 6660\n",
"4 TCGA-B5-A0JY-01 6338"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -647,7 +680,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {
"collapsed": false
},
Expand All @@ -658,7 +691,7 @@
"(10459, 20501)"
]
},
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -672,12 +705,11 @@
"expr_df = (expr_df\n",
" # Remove genes containing a `?`\n",
" [~expr_df.index.str.contains('?', regex=False)]\n",
" # Sort samples\n",
" .sort_index()\n",
" # Transpose so the data is sample × gene\n",
" .transpose()\n",
" # Sort genes\n",
" .sort_index()\n",
" # Sort rows and columns\n",
" .sort_index(axis='rows')\n",
" .sort_index(axis='columns')\n",
")\n",
"\n",
"expr_df.index.rename('sample_id', inplace=True)\n",
Expand All @@ -687,7 +719,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {
"collapsed": false
},
Expand Down Expand Up @@ -770,7 +802,7 @@
"TCGA-02-2486-01 6.7716 0.0 2.3973 7.5814 15.3224"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -791,18 +823,18 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"7698"
"7706"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -814,7 +846,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {
"collapsed": false
},
Expand All @@ -836,7 +868,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {
"collapsed": false
},
Expand All @@ -858,7 +890,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {
"collapsed": false
},
Expand Down
100 changes: 50 additions & 50 deletions data/subset/expression-matrix-all-genes.tsv

Large diffs are not rendered by default.

Loading

1 comment on commit ffe66ab

@dhimmel
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I posted data/expression-matrix.tsv.bz2 and data/mutation-matrix.tsv.bz2 created in this commit to figshare.

Please sign in to comment.