<a href="https://colab.research.google.com/github/chuducthang77/coronavirus/blob/main/Recurring_mutation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem: 
We identify a mutation as recurring if it appeared earlier, then disappeared, and later came back in other virus sequences. By definition, recurring means something that happens over and over again at regular intervals. Here the gap is 3 months: for example, a mutation that first occurred in Jan 2020, was absent in Feb, Mar, and Apr, appeared again in May, and then keeps repeating the same gap is considered recurring.

In [1]:
# Mount Google Drive at /content/gdrive so the data files stored there can be read and written.
# NOTE(review): `google.colab` is presumably only available on a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd 'gdrive/MyDrive/Machine Learning/coronavirus/analysis'
!ls

/content/gdrive/MyDrive/Machine Learning/coronavirus/analysis
H1N_H9N			       output.csv
Mutation_analysis.ipynb        Recurring_mutation.ipynb
mutations_spike_msa_apr21.csv


In [3]:
import pandas as pd
import numpy as np

In [29]:
#Read the csv
# Load the mutation table from the current working directory.
# The displayed output shows trailing rows whose fields are all empty; they are filtered out in the next step.
df = pd.read_csv('mutations_spike_msa_apr21.csv')
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6484,,,,,
6485,,,,,
6486,,,,,
6487,,,,,


In [30]:
#Eliminate the empty rows at the end of the file
# Rows without a Collection Date are the blank trailing rows of the CSV.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding columns to it later does not raise SettingWithCopyWarning.
df = df[df['Collection Date'].notna()].copy()
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957


In [31]:
#Create the column Month based on Collection Date
# NOTE: only the calendar month (1-12) is kept; the year is dropped, so
# e.g. January 2020 and January 2021 both become 1.
collection_dates = pd.to_datetime(df['Collection Date'], format='%Y-%m-%d')

# assign() builds a new frame instead of writing into a possibly sliced one,
# so there is no need to silence pandas' chained-assignment warning globally.
# .dt.month yields the integer month directly (no string round trip).
df = df.assign(Month=collection_dates.dt.month.astype(int))
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Month
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,1
2,QUJ34653.1,2020-01-01,USA,D614G,614,1
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614,1
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144,1
...,...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701,4
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614,4
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957,4
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957,4


In [32]:
#Split the comma-separated Mutations string into a list, then emit one row per
#list element; the exploded column ends up named 'Individual mutation'.
#The original row index is repeated for every mutation of the same sequence.
df = (
    df
    .assign(names=df['Mutations'].str.split(','))
    .explode('names')
    .rename(columns={'names': 'Individual mutation'})
)
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Month,Individual mutation
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,S477N
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,D614G
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,A930V
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,1,P809S
2,QUJ34653.1,2020-01-01,USA,D614G,614,1,D614G
...,...,...,...,...,...,...,...
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,E484K
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,D614G
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,H655Y
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,T1027I


In [48]:
#Check the mutation for the given interval
# Each interval is the exact set of calendar months a mutation must have been
# observed in (and no others) to be reported as recurring.
intervals = [{1,5,9},{2,6,10},{3,7,11},{4,8,12}]

#Group the mutation by their individual mutation and create the set of month it occurs
# Computed once: the grouping does not depend on the interval being checked.
months_by_mutation = df.groupby('Individual mutation')['Month'].apply(set)

result = []
for interval in intervals:

  #Keep only the mutations whose month set equals the given interval
  # Explicit elementwise comparison, rather than relying on how pandas
  # broadcasts `Series == set`.
  matches = months_by_mutation.map(lambda months: months == interval)

  #Group keys are already unique, so the matching index can be appended as-is
  result += months_by_mutation[matches].index.tolist()

print(result)

['Y144H', 'R237S', 'S1147A']


In [50]:
#Persist the list of recurring mutations as its Python list representation
#in output.txt (any existing file is overwritten, no trailing newline)
with open('output.txt', mode='w') as out_file:
  print(result, end='', file=out_file)