In [6]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as sf
import plotly.graph_objects as go
from IPython.display import Image
from pi_for_temperature_notebook import *

## Veg6

In [7]:
#load data
veg6_data = pd.read_csv("variant-calls_vegprop-expt_2020-04-07_tables.csv",header=0,index_col=0)
veg6_options = get_veg_prop_args()

#filter SNPs
filtered_veg6_data = pd.concat([veg6_data.loc[veg6_data.freqPropReplicateA >= 0.03],
                           veg6_data.loc[veg6_data.freqPropReplicateB >= 0.03]]).drop_duplicates()

filtered_veg6_data['DP'] = filtered_veg6_data['ADReplicateA'] + filtered_veg6_data['ADReplicateB'] + filtered_veg6_data['RDReplicateA'] + filtered_veg6_data['RDReplicateB']
filtered_veg6_data['AD'] = filtered_veg6_data['ADReplicateA'] + filtered_veg6_data['ADReplicateB']

#calculate frequency of aggregated read counts
filtered_veg6_data['freqProp'] = filtered_veg6_data['AD'] / filtered_veg6_data['DP']


#lineage: set to 0 then assign each unique lineage to an unique integer
filtered_veg6_data['lineage_factor'] = 0
j = 0
for lineage in filtered_veg6_data.lineage.unique():
    filtered_veg6_data.loc[filtered_veg6_data.lineage == lineage,['lineage_factor']] = j
    j += 1

#species: Set ACMV = 0 and EACMCV = 1
filtered_veg6_data['species'] = 0
filtered_veg6_data.loc[filtered_veg6_data.chrom == 'EACMCV DNA-A',['species']] = 1
filtered_veg6_data.loc[filtered_veg6_data.chrom == 'EACMCV DNA-B',['species']] = 1

#segment: Set DNA-A = 0 and DNA-B = 1
filtered_veg6_data['segment'] = 0
filtered_veg6_data.loc[filtered_veg6_data.chrom == 'ACMV DNA-B',['segment']] = 1
filtered_veg6_data.loc[filtered_veg6_data.chrom == 'EACMCV DNA-B',['segment']] = 1

#SEGs treatment: Virus Only:0, SEGS-1:1,SEGS-2:2
filtered_veg6_data['SEGs_treatment'] = 0
filtered_veg6_data.loc[filtered_veg6_data.segsTreatment2 == 'SEGS-1','SEGs_treatment'] = 1
filtered_veg6_data.loc[filtered_veg6_data.segsTreatment2 == 'SEGS-2','SEGs_treatment'] = 2

In [8]:
veg6_options['frequency'] = 'freqPropMeanNoNA'

In [28]:
veg6_avg = average_groups(filtered_veg6_data,['species','segment','pos','alt','passage','ref','lineage_factor','SEGs_treatment'])

veg6_avg.reset_index(inplace=True)
veg6_avg['pos'] = veg6_avg.pos.astype(int)
veg6_pi_df = get_group_pis(veg6_avg,
                     options=veg6_options,
                     group_by=['species','segment','passage','lineage_factor','SEGs_treatment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site_data[frequency] = [x/dp for x in site_data['AD']]


In [29]:
lm = sf.ols('pi ~ passage + C(SEGs_treatment) + (C(species)/C(segment)) + C(lineage_factor)',data=veg6_pi_df).fit(cov_type='HC1')
print("Least squares summary:")
print(lm.summary())

print("\nAnova table:")
veg6_table = sm.stats.anova_lm(lm)
print(veg6_table)


Least squares summary:
                            OLS Regression Results                            
Dep. Variable:                     pi   R-squared:                       0.179
Model:                            OLS   Adj. R-squared:                  0.139
Method:                 Least Squares   F-statistic:                     3.077
Date:                Mon, 10 Aug 2020   Prob (F-statistic):           0.000114
Time:                        12:14:16   Log-Likelihood:                -851.96
No. Observations:                 323   AIC:                             1736.
Df Residuals:                     307   BIC:                             1796.
Df Model:                          15                                         
Covariance Type:                  HC1                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------



In [30]:
veg6_table.to_csv("Veg6_full.csv")

In [40]:
veg6_avg = average_groups(filtered_veg6_data,['species','segment','pos','alt','passage','ref'])

veg6_avg.reset_index(inplace=True)
veg6_avg['pos'] = veg6_avg.pos.astype(int)
veg6_pi_df = get_group_pis(veg6_avg,
                     options=veg6_options,
                     group_by=['species','segment','passage'])

lm = sf.ols('pi ~ passage + C(species)/C(segment)',data=veg6_pi_df).fit(cov_type='HC1')
print("Least squares summary:")
print(lm.summary())

print("\nAnova table:")
veg6_table = sm.stats.anova_lm(lm)
print(veg6_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site_data[frequency] = [x/dp for x in site_data['AD']]


Least squares summary:
                            OLS Regression Results                            
Dep. Variable:                     pi   R-squared:                       0.576
Model:                            OLS   Adj. R-squared:                  0.334
Method:                 Least Squares   F-statistic:                     3.162
Date:                Mon, 10 Aug 2020   Prob (F-statistic):             0.0878
Time:                        12:36:06   Log-Likelihood:                -42.683
No. Observations:                  12   AIC:                             95.37
Df Residuals:                       7   BIC:                             97.79
Df Model:                           4                                         
Covariance Type:                  HC1                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------

  "anyway, n=%i" % int(n))


In [32]:
veg6_table.to_csv("Veg6_minimal.csv")

## Veg2

In [33]:
data = pd.read_csv("variant-calls_temperature-expt_2020-04-07_tables.csv", header=0, index_col=0)
veg2_options = get_veg_prop_args()

data_no_32 = data.loc[data.temperature != 32]
data_no_32_both_present = data_no_32.dropna(subset=['ADReplicateA','ADReplicateB'])
filtered_data = pd.concat([data_no_32_both_present.loc[data_no_32_both_present.freqPropReplicateA >= 0.3],
                           data_no_32_both_present.loc[data_no_32_both_present.freqPropReplicateB >= 0.03]]).drop_duplicates()

filtered_data['DP'] = filtered_data['ADReplicateA'] + filtered_data['ADReplicateB'] + filtered_data['RDReplicateA'] + filtered_data['RDReplicateB']
filtered_data['AD'] = filtered_data['ADReplicateA'] + filtered_data['ADReplicateB']

filtered_data['lineage_factor'] = 0

j = 0
for lineage in filtered_data.lineage.unique():
    filtered_data.loc[filtered_data.lineage == lineage,['lineage_factor']] = j
    j += 1

#species: Set ACMV = 0 and EACMCV = 1
filtered_data['species'] = 0
filtered_data.loc[filtered_data.chrom == 'EACMCV DNA-A',['species']] = 1
filtered_data.loc[filtered_data.chrom == 'EACMCV DNA-B',['species']] = 1

#segment: Set DNA-A = 0 and DNA-B = 1
filtered_data['segment'] = 0
filtered_data.loc[filtered_data.chrom == 'ACMV DNA-B',['segment']] = 1
filtered_data.loc[filtered_data.chrom == 'EACMCV DNA-B',['segment']] = 1

  interactivity=interactivity, compiler=compiler, result=result)


In [34]:
veg2_options['frequency'] = 'freqPropMeanNoNA'

In [35]:
veg2_avg = average_groups(filtered_data,['species','segment','pos','alt','passage','ref','lineage_factor','temperature'])

veg2_avg.reset_index(inplace=True)
veg2_avg['pos'] = veg2_avg.pos.astype(int)

veg2_pi_df = get_group_pis(veg2_avg,
                     options=veg2_options,
                     group_by=['species','segment','passage','lineage_factor','temperature'])

In [36]:
lm = sf.ols('pi ~ passage + C(temperature) + (C(species)/C(segment)) + C(lineage_factor)',data=veg2_pi_df).fit(cov_type='HC1')
print("Least squares summary:")
print(lm.summary())

print("\nAnova table:")
veg2_table = sm.stats.anova_lm(lm)
print(veg2_table)

Least squares summary:
                            OLS Regression Results                            
Dep. Variable:                     pi   R-squared:                       0.400
Model:                            OLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     5.416
Date:                Mon, 10 Aug 2020   Prob (F-statistic):           2.38e-05
Time:                        12:15:40   Log-Likelihood:                -123.36
No. Observations:                  67   AIC:                             266.7
Df Residuals:                      57   BIC:                             288.8
Df Model:                           9                                         
Covariance Type:                  HC1                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------



In [37]:
veg2_table.to_csv("Veg2_full.csv")

In [38]:
veg2_avg = average_groups(filtered_data,['species','segment','pos','alt','passage','ref'])

veg2_avg.reset_index(inplace=True)
veg2_avg['pos'] = veg2_avg.pos.astype(int)

veg2_pi_df = get_group_pis(veg2_avg,
                     options=veg2_options,
                     group_by=['species','segment','passage'])

lm = sf.ols('pi ~ passage + (C(species)/C(segment))',data=veg2_pi_df).fit(cov_type='HC1')
print("Least squares summary:")
print(lm.summary())

print("\nAnova table:")
veg2_table = sm.stats.anova_lm(lm)
print(veg2_table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  site_data[frequency] = [x/dp for x in site_data['AD']]


Least squares summary:
                            OLS Regression Results                            
Dep. Variable:                     pi   R-squared:                       0.834
Model:                            OLS   Adj. R-squared:                  0.740
Method:                 Least Squares   F-statistic:                     16.42
Date:                Mon, 10 Aug 2020   Prob (F-statistic):            0.00115
Time:                        12:15:55   Log-Likelihood:                -31.674
No. Observations:                  12   AIC:                             73.35
Df Residuals:                       7   BIC:                             75.77
Df Model:                           4                                         
Covariance Type:                  HC1                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------

  "anyway, n=%i" % int(n))


In [39]:
veg2_table.to_csv("Veg2_minimal.csv")