<a href="https://colab.research.google.com/github/chuducthang77/coronavirus/blob/main/Recurring_mutation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem: 
We’ll identify mutations as re-occurred mutations if they happened earlier, then probably disappeared and again come back in some other virus sequences. By definition, recurring means something that happens over and over again at regular intervals. The timeline is 3 months. That means if a mutation 1st occurred in Jan 2020 and no mutations in Feb, Mar, and April and then appeared again in May and repeat the gap

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd 'gdrive/MyDrive/Machine Learning/coronavirus/analysis'
!ls

/content/gdrive/MyDrive/Machine Learning/coronavirus/analysis
H1N_H9N			       output.csv
Mutation_analysis.ipynb        Recurring_mutation.ipynb
mutations_spike_msa_apr21.csv


In [None]:
import pandas as pd
import numpy as np

In [None]:
#Read the csv
df = pd.read_csv('mutations_spike_msa_apr21.csv')
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6484,,,,,
6485,,,,,
6486,,,,,
6487,,,,,


In [None]:
#Eliminate the empty row at the end of the file
df = df[df['Collection Date'].notna()]
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957


In [None]:
#Create the column Month based on Collection Date
pd.options.mode.chained_assignment = None
dates = pd.to_datetime(df['Collection Date'], format='%Y-%m-%d')
dates = dates.dt.strftime('%m')
df['Month'] = dates
df['Month'] = df['Month'].astype(str).astype(int)
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Month
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,1
2,QUJ34653.1,2020-01-01,USA,D614G,614,1
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614,1
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144,1
...,...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701,4
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614,4
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957,4
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957,4


In [None]:
#Convert the mutations columns to the list and expand for each item in the list to individual row
df = df.assign(names=df['Mutations'].str.split(',')).explode('names')
df = df.rename(columns={'names': 'Individual mutation'})
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Month,Individual mutation
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,S477N
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,D614G
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,1,A930V
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,1,P809S
2,QUJ34653.1,2020-01-01,USA,D614G,614,1,D614G
...,...,...,...,...,...,...,...
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,E484K
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,D614G
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,H655Y
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,4,T1027I


In [None]:
#Check the mutation for the given interval
intervals = [{1,5,9},{2,6,10},{3,7,11},{4,8,12}]
result = []
for interval in intervals:
  
  #Group the mutation by their individual mutation and create the set of month it occurs
  temp_df = df.groupby('Individual mutation')['Month'].apply(set).reset_index()

  #Keep only the set matching the given interval
  temp_df = temp_df[temp_df['Month'] == interval]

  #Keep only the unique individual mutation
  result += list(temp_df['Individual mutation'].unique())

print(result)

['Y144H', 'R237S', 'S1147A']


In [None]:
#Save the output to the txt file
with open('output.txt', 'w') as output:
  output.write(str(result))

# New Solution:
1. Count the frequency of each mutation for each month starting from JAN 2020 TO APR 2021 (16 months). Present it as a vector. (vector of 16 values)

Mutation 1: (p1, p2, p3, ...., p16) where pi is the frequency of mutation 1 in each month. 
Mutation 2: (p'1, p'2, p'3, ...., p'16)where p'i is the frequency of mutation 2 in each month.

 Frequency is calculated as the Occurrence of mutation/Number of total sequences of each month.

2. Calculate the pairwise correlation coefficient between two vectors. (Ref: https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/)

3. Group those mutations in terms of their C.C. values. (low, high, 0). If C.C. is high (1) then the mutations are similar to each other. If low (-1) then the mutations are not related. If 0, then there is no relationship. 

4. Then we need to peak a group and analyze the mutations. 

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd 'gdrive/MyDrive/Machine Learning/coronavirus/analysis'
!ls

/content/gdrive/MyDrive/Machine Learning/coronavirus/analysis
H1N_H9N			       output.csv
Mutation_analysis.ipynb        output.txt
mutations_spike_msa_apr21.csv  Recurring_mutation.ipynb


In [3]:
import pandas as pd
import numpy as np

In [4]:
#Read the csv
df = pd.read_csv('mutations_spike_msa_apr21.csv')
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6484,,,,,
6485,,,,,
6486,,,,,
6487,,,,,


In [5]:
#Eliminate the empty row at the end of the file
df = df[df['Collection Date'].notna()]
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809
2,QUJ34653.1,2020-01-01,USA,D614G,614
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144
...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957


In [7]:
#Create the column Month based on Collection Date
pd.options.mode.chained_assignment = None
dates = pd.to_datetime(df['Collection Date'], format='%Y-%m-%d')
dates = dates.dt.strftime('%Y-%m')
df['Year-Month'] = dates
# df['Year-Month'] = df['Year-Month'].astype(str).astype(int)
df['Year-Month'] = df['Year-Month'].astype(str)
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Year-Month
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,2020-01
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,2020-01
2,QUJ34653.1,2020-01-01,USA,D614G,614,2020-01
3,QPK67588.1,2020-01-01,Australia: Victoria,"S477N,D614G",477614,2020-01
4,QPK73836.1,2020-01-01,Australia: Victoria,"S477N,D614G,E1144Q",4776141144,2020-01
...,...,...,...,...,...,...
6449,QVE49796.1,2021-04-28,USA: Ohio,"E484K,D614G,A701V",484614701,2021-04
6450,QVE49641.1,2021-04-28,USA: Ohio,"S13I,W152C,G257V,L452R,D614G",13152257452614,2021-04
6451,QUW97784.1,2021-04-28,USA: Massachusetts,"L5F,T95I,D253G,W258L,S477N,D614G,Q957R",595253258477614957,2021-04
6452,QUX02295.1,2021-04-28,USA: Maine,"K77N,T95I,F157S,D253G,L452R,S477N,D614G,Q957R",7795157253452477614957,2021-04


In [8]:
#Convert the mutations columns to the list and expand for each item in the list to individual row
df = df.assign(names=df['Mutations'].str.split(',')).explode('names')
df = df.rename(columns={'names': 'Individual mutation'})
df

Unnamed: 0,Accession ID,Collection Date,Location,Mutations,Mutation Positions,Year-Month,Individual mutation
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,2020-01,S477N
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,2020-01,D614G
0,QPF67537.1,2020-01-01,Australia: Victoria,"S477N,D614G,A930V",477614930,2020-01,A930V
1,QLJ57383.1,2020-01-01,USA: WA,P809S,809,2020-01,P809S
2,QUJ34653.1,2020-01-01,USA,D614G,614,2020-01,D614G
...,...,...,...,...,...,...,...
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,2021-04,E484K
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,2021-04,D614G
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,2021-04,H655Y
6453,QVE49761.1,2021-04-29,USA: Ohio,"L18F,T20N,P26S,D138Y,R190S,K417T,E484K,D614G,H...",18202613819041748461465510271176,2021-04,T1027I


In [9]:
#Group the mutation by their individual mutation and create the set of month it occurs
temp_df = df.groupby(['Individual mutation', 'Year-Month']).size().reset_index()
temp_df = temp_df.rename(columns={0: "Count"})
temp_df

Unnamed: 0,Individual mutation,Year-Month,Count
0,A1016S,2020-04,1
1,A1016S,2021-03,1
2,A1016T,2020-10,1
3,A1016T,2020-11,1
4,A1016T,2020-12,1
...,...,...,...
6005,Y837F,2021-03,1
6006,Y837H,2020-12,1
6007,Y837H,2021-02,1
6008,Y837H,2021-03,1


In [33]:
frequency_df = temp_df.pivot_table('Count', 'Individual mutation', 'Year-Month', aggfunc="sum", fill_value=0)
frequency_df

Year-Month,2020-01,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,2021-03,2021-04
Individual mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1016S,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
A1016T,0,0,0,0,0,0,0,0,0,1,1,1,2,1,0,1
A1016V,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1
A1020D,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
A1020S,0,0,1,0,0,0,0,3,0,0,1,1,4,7,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y789D,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Y789H,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Y837F,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
Y837H,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0


In [17]:
frequency_array = frequency_df.to_numpy()
frequency_array

In [28]:
#Calculate row-wise Pearson correlation coefficient matrix
frequency_mf = frequency_array - frequency_array.mean(1)[:, None]

frequency_sq = (frequency_mf**2).sum(1)

pearson_correlation = np.dot(frequency_mf, frequency_mf.T)/np.sqrt(np.dot(frequency_sq[:,None],frequency_sq[None]))
pearson_correlation

array([[ 1.        , -0.27144836,  0.5831297 , ...,  0.68313005,
         0.30261377, -0.09759001],
       [-0.27144836,  1.        , -0.06088061, ..., -0.18543453,
         0.18071682,  0.23841582],
       [ 0.5831297 , -0.06088061,  1.        , ...,  0.88863455,
         0.43708546, -0.09192771],
       ...,
       [ 0.68313005, -0.18543453,  0.88863455, ...,  1.        ,
         0.53748385, -0.06666667],
       [ 0.30261377,  0.18071682,  0.43708546, ...,  0.53748385,
         1.        ,  0.53748385],
       [-0.09759001,  0.23841582, -0.09192771, ..., -0.06666667,
         0.53748385,  1.        ]])

In [32]:
#Categorize the result if correlation coefficient == 1 (strong), correlation coefficient == -1 (no correlation), coefficient correlation == 0 (no relationship)
conditions = [(pearson_correlation == 1), (pearson_correlation == -1), (pearson_correlation == 0)]
choices = ['Strong', 'No correlation', 'No relationship']
result = np.select(conditions, choices, default="None")
result

array([['Strong', 'None', 'None', ..., 'None', 'None', 'None'],
       ['None', 'Strong', 'None', ..., 'None', 'None', 'None'],
       ['None', 'None', 'Strong', ..., 'None', 'None', 'None'],
       ...,
       ['None', 'None', 'None', ..., 'Strong', 'None', 'None'],
       ['None', 'None', 'None', ..., 'None', 'Strong', 'None'],
       ['None', 'None', 'None', ..., 'None', 'None', 'Strong']],
      dtype='<U15')