<a href="https://colab.research.google.com/github/emma-kar/osteosarcoma/blob/main/CLEAN_COPY_Final_Osteosarcoma_SEER_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import packages to be used
import pandas as pd
import numpy as np
!pip install plotly_express
import plotly_express as px
import plotly.graph_objects as go
from scipy.stats import fisher_exact
import matplotlib.pyplot as plt


Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly_express
Successfully installed plotly_express-0.4.1


In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# opened by path in the cells below. This uses the Colab-specific google.colab
# module.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the data file used throughout this notebook (an extract from SEER) and
# preview its first 10 rows.
seer_csv_path = "/content/drive/MyDrive/SP_research/ost_agerates_042823.csv"
df = pd.read_csv(seer_csv_path)
df.iloc[:10]

Unnamed: 0,Sex,Age recode with <1 year olds,Year of diagnosis,Age-Adjusted Rate,Count,Population
0,Male and female,00 years,1975-2020,0.0,0,14295114
1,Male and female,00 years,1975,0.0,0,240660
2,Male and female,00 years,1976,0.0,0,240444
3,Male and female,00 years,1977,0.0,0,254975
4,Male and female,00 years,1978,0.0,0,262016
5,Male and female,00 years,1979,0.0,0,273282
6,Male and female,00 years,1980,0.0,0,287392
7,Male and female,00 years,1981,0.0,0,293927
8,Male and female,00 years,1982,0.0,0,296658
9,Male and female,00 years,1983,0.0,0,297450


In [None]:
# Build the cleaned, sex-specific frame from the raw SEER extract:
#   * drop the aggregate '1975-2020' rows so every remaining row is a single
#     calendar year, then store the year as an integer;
#   * drop the combined 'Male and female' rows, keeping the sex-specific rows;
#   * drop the '00 years' and 'Unknown' age groups.
is_single_year = df["Year of diagnosis"] != "1975-2020"
df_clean = df.loc[is_single_year].copy()
df_clean['Year of diagnosis'] = df_clean['Year of diagnosis'].astype(int)

is_sex_specific = df_clean['Sex'] != 'Male and female'
is_kept_age = ~df_clean['Age recode with <1 year olds'].isin(['00 years', 'Unknown'])
df_clean = df_clean[is_sex_specific & is_kept_age]

In [None]:
# Save the cleaned dataframe to an Excel workbook (relative path, so it lands in
# the current working directory) for use from a different notebook.
# NOTE(review): index=False and header=False write the data rows only -- no index
# column and no column-name row. A reader of this file has to supply the column
# names itself (a plain pd.read_excel would treat the first data row as the
# header) -- confirm this is intended.
df_clean.to_excel('df_clean_osteosarcoma2020_new.xlsx', index=False, header=False)

Getting graph for total Osteosarcoma incidence

In [None]:
# Bar chart of the total number of osteosarcoma cases per age group: "Count" is
# summed over every row of df_clean (all years, both sexes) that shares an age group.
fig = px.histogram(
    df_clean,
    x="Age recode with <1 year olds",
    y="Count",
    histfunc="sum",
    width=600,
    height=600,
)

# The figure title is currently disabled. The text that was used for it:
#   "Age in patients with Osteosarcoma <br> of long bones of lower limbs from SEER data 1975-2020"
#   (y=0.95, x=0.5, xanchor='center', yanchor='top')
layout_options = {
    "xaxis_title": "Age of diagnosis",
    "yaxis_title": "Number of Tumors",
    "font": {"family": "arial, monospace", "size": 12},
}
fig.update_layout(**layout_options)

In [None]:
# Export options for plotly's image-download button, so the figure can be saved
# as a higher-resolution PNG, then display the figure with those options.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='SEER_Osteosarcoma_total_notitle_2020',
        height=500,
        width=700,
        scale=6,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

fig.show(config=config)

Graphing Males vs Females

In [None]:
# Total case count for every (age group, sex) pair, summed across all years.
# The result feeds the male-vs-female line graph.
age_sex_keys = ['Age recode with <1 year olds', 'Sex']
count_by_age_and_sex = df_clean.groupby(age_sex_keys)['Count'].sum()
df_MF = count_by_age_and_sex.reset_index()

In [None]:
# Line graph of the number of osteosarcoma cases per age group, one line per sex
# (Male listed before Female), using the totals in df_MF.
fig = px.line(
    df_MF,
    x="Age recode with <1 year olds",
    y="Count",
    color="Sex",
    width=900,
    height=500,
    category_orders={"Sex": ["Male", "Female"]},
)

# The figure title is currently disabled. The text that was used for it:
#   "Age in patients with Osteosarcoma <br> of long bones of lower limbs from SEER data 1975-2020"
#   (y=0.95, x=0.5, xanchor='center', yanchor='top')
layout_options = {
    "xaxis_title": "Age of diagnosis",
    "yaxis_title": "Number of Tumors",
    "font": {"family": "arial, monospace", "size": 12},
}
fig.update_layout(**layout_options)

In [None]:
# Export options for plotly's image-download button, so the male-vs-female line
# graph can be saved as a higher-resolution PNG, then display the figure.
# The file name spells the sexes as "M_F" rather than "M/F": '/' is a path
# separator and is not valid inside a file name, so what a browser does with it
# when saving the download is not something to rely on.
config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'SEER_Osteosarcoma_M_F_line_notitle_2020',
    'height': 500,
    'width': 900,
    'scale':6 # Multiply title/legend/axis/canvas sizes by this factor
  }
}


fig.show(config=config)

Subcohort analysis of the 40+ age group

In [None]:
# Build the combined-sex frame used for the 40+ subcohort analysis. Rows kept:
#   * single calendar years only (the aggregate '1975-2020' rows are dropped and
#     the year is stored as an integer);
#   * Sex == 'Male and female', i.e. the rows that combine both sexes;
#   * Count >= 0 (this only excludes rows whose Count is negative or missing);
#   * every age group except '00 years' and 'Unknown'.
single_years = df.loc[df["Year of diagnosis"] != "1975-2020"].copy()
single_years['Year of diagnosis'] = single_years['Year of diagnosis'].astype(int)

keep_rows = (
    (single_years['Count'] >= 0)
    & (single_years['Sex'] == 'Male and female')
    & ~single_years['Age recode with <1 year olds'].isin(['00 years', 'Unknown'])
)
df_MF_total = single_years[keep_rows]

In [None]:
# Restrict the combined-sex frame to the age groups from 40 years upward, then
# total the case count for every (year, age group) pair. The result has one row
# per pair, ordered by year and then age group.
ages_40_and_over = [
    '40-44 years', '45-49 years', '50-54 years', '55-59 years', '60-64 years',
    '65-69 years', '70-74 years', '75-79 years', '80-84 years', '85+ years',
]
is_40_and_over = df_MF_total['Age recode with <1 year olds'].isin(ages_40_and_over)
df_40plus1 = (
    df_MF_total[is_40_and_over]
    .groupby(['Year of diagnosis', 'Age recode with <1 year olds'])['Count']
    .sum()
    .reset_index()
)

In [None]:
# Find, for each year, which 40+ age group(s) had the highest case count.
# Step 1: df_maxcount holds one row per year with that year's largest per-age-group
# Count.
df_maxcount = df_40plus1.groupby(['Year of diagnosis'])['Count'].max().reset_index()
# Step 2: merge back onto df_40plus1 on (year, Count) to recover the age group(s)
# that reached that maximum. When several age groups tie for the maximum in a
# year, the merge returns one row per tied age group, so that year appears more
# than once in df_maxage.
df_maxage = df_maxcount.merge(df_40plus1, on=['Year of diagnosis','Count'], how = 'left')

In [None]:
# Some years have more than one age group tied for the maximum. Count how many
# distinct age groups reached the maximum in each year and keep only the years
# where exactly one did. In the result, the 'Age recode with <1 year olds' column
# holds that number of distinct age groups (always 1 after the filter), not an
# age label; only the 'Year of diagnosis' column is used downstream.
df_singlemax = (
    df_maxage
    .groupby('Year of diagnosis')['Age recode with <1 year olds']
    .nunique()
    .reset_index()
    .loc[lambda per_year: per_year['Age recode with <1 year olds'] == 1]
)

In [None]:
# Histogram for the subcohort analysis: for the years that had a single (untied)
# maximum, count how many times each 40+ age group was the one holding it.
years_with_unique_max = df_singlemax['Year of diagnosis'].unique()
unique_max_rows = df_maxage[df_maxage['Year of diagnosis'].isin(years_with_unique_max)]

figmax = px.histogram(
    unique_max_rows,
    width=600,
    height=500,
    x='Age recode with <1 year olds',
    # Fix the x-axis order so age groups run from youngest to oldest.
    category_orders={
        'Age recode with <1 year olds': [
            '40-44 years', '45-49 years', '50-54 years', '55-59 years',
            '60-64 years', '65-69 years', '70-74 years', '75-79 years',
            '80-84 years', '85+ years',
        ]
    },
)

# The figure title is currently disabled. The text that was used for it:
#   "Frequency of Unique Maximums of <br> Osteosarcoma of the long bones of the lower limbs for <br> 40+ years old in SEER data 1975-2020"
#   (y=0.97, x=0.5, xanchor='center', yanchor='top')
layout_options = {
    "xaxis_title": "Age of diagnosis",
    "yaxis_title": "Frequency of Unique Maximums",
    "font": {"family": "arial, monospace", "size": 12},
}
figmax.update_layout(**layout_options)

In [None]:
# Export options for plotly's image-download button, so the unique-maximum
# histogram can be saved as a higher-resolution PNG, then display the figure.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='SEER_40plus_truemax_2020',
        height=500,
        width=700,
        scale=6,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)
figmax.show(config=config)

Create 3D graphs

In [None]:
#import data file to be used
#df = pd.read_excel('/content/drive/MyDrive/SP_research/df_clean_osteosarcoma2020_new.xlsx')

In [None]:
# Define the age groups as an ordered categorical dtype (youngest to oldest) and
# apply it to df_clean, so sorting and grouping follow age order. Any age label
# not listed here would become missing (NaN) when the dtype is applied.
age_labels = [
    '01-04 years', '05-09 years', '10-14 years', '15-19 years',
    '20-24 years', '25-29 years', '30-34 years', '35-39 years',
    '40-44 years', '45-49 years', '50-54 years', '55-59 years',
    '60-64 years', '65-69 years', '70-74 years', '75-79 years',
    '80-84 years', '85+ years',
]
age_order = pd.CategoricalDtype(categories=age_labels, ordered=True)

age_column = 'Age recode with <1 year olds'
df_clean[age_column] = df_clean[age_column].astype(age_order)

In [None]:
# Build a complete (age group x year) table of case counts, summed over both
# sexes, with an explicit 0 wherever a combination has no cases. Rows are ordered
# by age group and then by year, which is the layout the following cell relies on
# when it cuts the 'Count' column into equal-length rows for the 3D surface.
age_col = 'Age recode with <1 year olds'
year_col = 'Year of diagnosis'

# Total count per (year, age group). observed=False is spelled out because the age
# column is categorical: it keeps every category in the result (with a sum of 0
# where nothing was observed) instead of depending on the pandas default.
df_consolidated = (
    df_clean.groupby([year_col, age_col], observed=False)['Count'].sum().reset_index()
)

# One row for every (age group, year) pair that can be formed from the values
# present in df_clean.
age_year_pairs = [
    [age, year]
    for age in df_clean[age_col].unique()
    for year in df_clean[year_col].unique()
]
df_w_zeros = pd.DataFrame(age_year_pairs, columns=[age_col, year_col])

# Attach the totals. A pair with no matching total comes back from the left merge
# as NaN; that means "no cases", so store it as 0.
df_w_zeros = df_w_zeros.merge(df_consolidated, on=[age_col, year_col], how='left')
df_w_zeros['Count'] = df_w_zeros['Count'].fillna(0)

# Apply the ordered age dtype BEFORE sorting so the row order follows age_order
# itself rather than the alphabetical order of the labels.
df_w_zeros[age_col] = df_w_zeros[age_col].astype(age_order)
df_w_zeros = df_w_zeros.sort_values([age_col, year_col])

In [None]:
# Reshape the flat 'Count' column of df_w_zeros into a list of rows (z_data) for
# the surface plot. df_w_zeros is sorted by age group and then by year, so each
# consecutive run of n_years values is one age group across the years; z_data
# therefore holds one sublist per age group. This assumes the number of rows is a
# multiple of the number of unique years.
n_years = df_w_zeros['Year of diagnosis'].nunique()
my_list = list(df_w_zeros['Count'])
z_data = [
    my_list[start : start + n_years]
    for start in range(0, df_w_zeros.shape[0], n_years)
]

In [None]:
# Here we create our first 3D surface plot where the x axis is the year, y axis
# is the age group, and z axis is the Count (referencing the z_data set we made
# in the code above).
# x and y are the distinct years and age groups in the order they appear in
# df_w_zeros; go.Surface reads z_data as one row per y value and one column per
# x value.
x, y  = df_w_zeros['Year of diagnosis'].unique(), df_w_zeros['Age recode with <1 year olds'].unique()
trace = go.Surface(x=x, y=y, z=z_data, colorscale='plotly3')
data = [trace]

# Camera position used for the initial view of the 3D scene.
x_eye = 2.5
y_eye = 2.5
z_eye = 2.0

fig = go.Figure(data=data)
# Size, camera and scene proportions. A 'Play' button wired to plotly's 'animate'
# method is also added, but this cell defines no animation frames for it.
# NOTE(review): the width=1600 set here is overridden by width=1200 in the second
# update_layout call below.
fig.update_layout(height = 800, width=1600,
                           scene_camera_eye=dict(x=x_eye, y=y_eye, z=z_eye),
         updatemenus=[dict(type='buttons',
                  showactive=False,
                  y=1,
                  x=0.8,
                  xanchor='left',
                  yanchor='bottom',
                  pad=dict(t=45, r=10),
                  buttons=[dict(label='Play',
                                 method='animate',
                                 args=[None, dict(frame=dict(duration=5, redraw=True),
                                                             transition=dict(duration=1),
                                                             fromcurrent=True,
                                                             mode='immediate'
                                                            )]
                                            )
                                      ]
                              )
                        ]
).update_scenes(aspectratio_x = 2.5, aspectratio_y = 2.5, aspectratio_z = 1)

# Axis titles, final width and margins. The figure title is commented out.
fig.update_layout(#title={
       #'text':"Incidence of Osteosarcoma <br> of long bones of lower limbs <br> from SEER data 1975-2019",
       # 'y':0.78,
       # 'x':0.5,
       #'xanchor': 'center',
     #  'yanchor': 'top'},
        scene = dict(
                    xaxis_title='Year (from 1975-2020)',
                    yaxis_title='Age at Diagnosis',
                    zaxis_title='Incidence'),
                    width=1200,margin=dict(r=30, b=10, l=10, t=10))
fig.show()

In [None]:
# Export options for plotly's image-download button, so the 3D incidence surface
# can be saved as a higher-resolution PNG, then display the figure again with
# those options.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='SEER_Osteosarcoma_3D_incidence_notitle_2020',
        height=1000,
        width=1400,
        scale=2.0,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

fig.show(config=config)

In [None]:
# Build the combined-sex table (one row per age group per calendar year, with
# Count and Population) that the incidence-rate surface, the population surface
# and the Fisher's exact tests below are computed from.
#
# Rows are first sorted by age group and then by year; the later cells depend on
# this order when they cut a column into one row per age group.
age_adjusted = df.sort_values(['Age recode with <1 year olds', 'Year of diagnosis'])

# Keep only the rows that combine both sexes ('Male and female') ...
is_both_sexes = age_adjusted['Sex'] == 'Male and female'
# ... for single calendar years (the aggregate '1975-2020' rows are dropped) ...
is_single_year = age_adjusted['Year of diagnosis'] != '1975-2020'
# ... and for the age groups that are plotted. '00 years' has to be removed along
# with 'Unknown': it is not one of the categories in age_order, so if it were left
# in, its rows would turn into a missing age group when that dtype is applied and
# would add an extra, unlabeled row to the surfaces.
is_plotted_age = ~age_adjusted['Age recode with <1 year olds'].isin(['00 years', 'Unknown'])

# .copy() makes the result an independent frame, so columns can be added to it later
# without pandas warning about writing to a slice.
age_adjusted_both = age_adjusted[is_both_sexes & is_single_year & is_plotted_age].copy()

In [None]:
# Give the age-group column the ordered age dtype and add 'age_rate': the number
# of osteosarcoma cases per 1,000,000 population for each age group in each year,
# computed from that row's 'Count' and 'Population'.
#
# age_adjusted_both was produced by filtering another frame, so take an explicit
# copy before adding columns; this avoids pandas' SettingWithCopyWarning and
# matches how the other filtered frames in this notebook are created.
age_adjusted_both = age_adjusted_both.copy()
age_adjusted_both['Age recode with <1 year olds'] = (
    age_adjusted_both['Age recode with <1 year olds'].astype(age_order)
)
age_adjusted_both['age_rate'] = (
    age_adjusted_both['Count'] * 1000000 / age_adjusted_both['Population']
)

In [None]:
# Reshape the flat 'age_rate' column of age_adjusted_both into a list of rows
# (z_data) for the surface plot. The frame was sorted by age group and then by
# year when it was built, so each consecutive run of n_years values is one age
# group across the years. This assumes the number of rows is a multiple of the
# number of unique years in the 'Year of diagnosis' column.
n_years = age_adjusted_both['Year of diagnosis'].nunique()
my_list = list(age_adjusted_both['age_rate'])
z_data = [
    my_list[start : start + n_years]
    for start in range(0, age_adjusted_both.shape[0], n_years)
]

In [None]:
# Here we create our second 3D surface plot where the x axis is the year, y axis
# is the age group, and z axis is the rate of osteosarcoma incidence in each age group
# each calendar year. The z axis term is graphing the new term we defined above called
# 'age_rate' (and also references the z_data set we made in the code above).
# x and y are the distinct years and age groups in the order they appear in
# age_adjusted_both; go.Surface reads z_data as one row per y value and one
# column per x value.
x, y  = age_adjusted_both['Year of diagnosis'].unique(), age_adjusted_both['Age recode with <1 year olds'].unique()
trace = go.Surface(x=x, y=y, z=z_data, colorscale='plotly3')
data = [trace]

# Camera position used for the initial view of the 3D scene.
x_eye = 2.5
y_eye = 2.5
z_eye = 2

fig = go.Figure(data=data)
# Size, camera and scene proportions. A 'Play' button wired to plotly's 'animate'
# method is also added, but this cell defines no animation frames for it.
fig.update_layout(height = 1000, width=1200,
                           scene_camera_eye=dict(x=x_eye, y=y_eye, z=z_eye),
         updatemenus=[dict(type='buttons',
                  showactive=False,
                  y=1,
                  x=0.8,
                  xanchor='left',
                  yanchor='bottom',
                  pad=dict(t=45, r=10),
                  buttons=[dict(label='Play',
                                 method='animate',
                                 args=[None, dict(frame=dict(duration=5, redraw=True),
                                                             transition=dict(duration=1),
                                                             fromcurrent=True,
                                                             mode='immediate'
                                                            )]
                                            )
                                      ]
                              )
                        ]
).update_scenes(aspectratio_x = 2.5, aspectratio_y = 2.5, aspectratio_z = 1)
# Axis titles, width and margins. The figure title is commented out.
fig.update_layout(#title={
        #'text':"Incidence Rate of Osteosarcoma <br> of long bones of lower limbs <br> from SEER data 1975-2019",
        #'y':0.78,
       # 'x':0.5,
       # 'xanchor': 'center',
       # 'yanchor': 'top'},
        scene = dict(
                    xaxis_title='Year (from 1975-2020)',
                    yaxis_title='Age at Diagnosis',
                    zaxis_title='Incidence Rate'),
                  width=1200,margin=dict(r=30, b=10, l=10, t=10))
fig.show()

In [None]:
# Export options for plotly's image-download button, so the 3D incidence-rate
# surface can be saved as a higher-resolution PNG, then display the figure again
# with those options.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='SEER_Osteosarcoma_3D_IR_notitle_2020',
        height=1000,
        width=1200,
        scale=2.0,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

fig.show(config=config)

TODO: decide whether the cells in this section should be deleted.
Creating a 3D plot that looks at the overall population data across the years


In [None]:
# Reshape the flat 'Population' column of age_adjusted_both into a list of rows
# (z_data) for the surface plot. The frame was sorted by age group and then by
# year when it was built, so each consecutive run of n_years values is one age
# group across the years.
#
# The row length is taken from the data (the number of unique years) rather than
# hard-coded: a fixed length of 45 is shorter than the step whenever there are
# more than 45 years, which silently drops the last year(s) of every age group
# and leaves z_data narrower than the x axis. This assumes the number of rows is
# a multiple of the number of unique years in the 'Year of diagnosis' column.
n_years = age_adjusted_both['Year of diagnosis'].nunique()
my_list = list(age_adjusted_both['Population'])
z_data = [
    my_list[start : start + n_years]
    for start in range(0, len(my_list), n_years)
]

In [None]:
# Here we create our third 3D surface plot where the x axis is the year, y axis
# is the age group, and z axis is overall population in each age group
# each calendar year. We plotted this graph to see the aging of the 'baby-boomer'
# generation across the calendar years, which may have an impact on the rate of osteosarcoma.
# x and y are the distinct years and age groups in the order they appear in
# age_adjusted_both; go.Surface reads z_data as one row per y value and one
# column per x value.
x, y  = age_adjusted_both['Year of diagnosis'].unique(),  age_adjusted_both['Age recode with <1 year olds'].unique()
trace = go.Surface(x=x, y=y, z=z_data, colorscale='plotly3')
data = [trace]

# Camera position used for the initial view of the 3D scene.
x_eye = 2.5
y_eye = 2.5
z_eye = 2

fig = go.Figure(data=data)
# Fix the z (population) axis to the range 0 to 2,500,000 with about 4 ticks.
fig.update_layout(scene=dict(zaxis = dict(nticks=4, range=[0,2500000],)))
# Size, camera and scene proportions. A 'Play' button wired to plotly's 'animate'
# method is also added, but this cell defines no animation frames for it.
fig.update_layout(height = 800, width=1600,
                           scene_camera_eye=dict(x=x_eye, y=y_eye, z=z_eye),
         updatemenus=[dict(type='buttons',
                  showactive=False,
                  y=1,
                  x=0.8,
                  xanchor='left',
                  yanchor='bottom',
                  pad=dict(t=45, r=10),
                  buttons=[dict(label='Play',
                                 method='animate',
                                 args=[None, dict(frame=dict(duration=5, redraw=True),
                                                             transition=dict(duration=1),
                                                             fromcurrent=True,
                                                             mode='immediate'
                                                            )]
                                            )
                                      ]
                              )
                        ]
).update_scenes(aspectratio_x = 2.5, aspectratio_y = 2.5, aspectratio_z = 1)


fig.show()

In [None]:
# Export options for plotly's image-download button, so the 3D population surface
# can be saved as a higher-resolution PNG, then display the figure again with
# those options.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='SEER_Osteosarcoma_popdata_2020',
        height=1000,
        width=1200,
        scale=2.0,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

fig.show(config=config)

Fisher's exact test attempt: testing whether there is an association between being 65+ years old and having osteosarcoma

In [None]:
# Run a Fisher's exact test for every calendar year to see whether osteosarcoma
# incidence differs between two older-adult age bands:
#   * 35-64 years (the six five-year groups from '35-39 years' to '60-64 years')
#   * 65+ years   (the five groups from '65-69 years' to '85+ years')
# Null hypothesis: the odds of osteosarcoma are the same in both bands. Rejecting
# it in a given year supports a difference in incidence between the bands for that
# year; not rejecting it only means the data for that year do not show a
# difference. Because the data span many calendar years, one test is run per year
# and the results are collected so the rejected / not-rejected years can be tallied.
#
# Each year's 2x2 table is:
#                     65+              35-64
#   cases          Count            Count
#   non-cases      Pop - Count      Pop - Count
# The second row is the population WITHOUT the cases. The cells of a Fisher table
# must not overlap, and the total population already includes the cases counted in
# the first row. With an odds ratio above 1 meaning higher odds in the 65+ band.
ages_35_to_64 = ['35-39 years', '40-44 years', '45-49 years', '50-54 years',
                 '55-59 years', '60-64 years']
ages_65_plus = ['65-69 years', '70-74 years',
                '75-79 years', '80-84 years', '85+ years']

results_dict = {}
for year in age_adjusted_both['Year of diagnosis'].unique():
    # Combined-sex rows for this year only.
    year_rows = age_adjusted_both[
        (age_adjusted_both['Year of diagnosis'] == year)
        & (age_adjusted_both['Sex'] == 'Male and female')
    ]
    age_of_row = year_rows['Age recode with <1 year olds']
    df_35to64 = year_rows[age_of_row.isin(ages_35_to_64)]
    df_65plus = year_rows[age_of_row.isin(ages_65_plus)]

    cases_65plus = df_65plus['Count'].sum()
    cases_35to64 = df_35to64['Count'].sum()
    contingency_table = [
        [cases_65plus, cases_35to64],
        [df_65plus['Population'].sum() - cases_65plus,
         df_35to64['Population'].sum() - cases_35to64],
    ]

    # Two-sided test: a difference in either direction counts against the null.
    odds_ratio, p_value = fisher_exact(contingency_table, alternative='two-sided')
    print(year)
    print("Odds ratio:", odds_ratio)
    print("p-value:", p_value, "\n")
    # The stored p-value is rounded to 5 decimals (it is used as the bar label in
    # the p-value chart); the printed value above is unrounded.
    results_dict[year] = {'Odds ratio': odds_ratio, 'p-value': round(p_value, 5)}

# Collect the per-year results into one dataframe with columns
# 'Year', 'Odds ratio' and 'p-value'.
results_df = pd.DataFrame.from_dict(results_dict, orient='index').reset_index().rename(columns={'index':'Year'})


1975
Odds ratio: 0.8941555486352581
p-value: 1.0 

1976
Odds ratio: 0.0
p-value: 1.0 

1977
Odds ratio: 4.557014784662795
p-value: 0.10104945976047068 

1978
Odds ratio: 2.2563273949429123
p-value: 0.37664026170050685 

1979
Odds ratio: 2.973160907071877
p-value: 0.05687377550638532 

1980
Odds ratio: 3.923782059903415
p-value: 0.07395029314792763 

1981
Odds ratio: 2.9061955102288026
p-value: 0.12229192459025279 

1982
Odds ratio: 2.9049063313186188
p-value: 0.17918137148914165 

1983
Odds ratio: 1.926395980544656
p-value: 0.6084420905175549 

1984
Odds ratio: 2.1655406006080757
p-value: 0.3834884650515897 

1985
Odds ratio: 4.32575447633336
p-value: 0.11155879621028232 

1986
Odds ratio: 2.8785676371297066
p-value: 0.1820077026229253 

1987
Odds ratio: 0.959166961756374
p-value: 1.0 

1988
Odds ratio: 3.8646116815632725
p-value: 0.0767230178483555 

1989
Odds ratio: 0.0
p-value: 0.33813491563019143 

1990
Odds ratio: 1.6749823288322456
p-value: 0.4870284437765008 

1991
Odds ratio: 1

In [None]:
# Bar chart of the p-value from each year's Fisher's exact test, with each bar
# labelled by its p-value. The dashed red line marks p = 0.05. Bars that reach
# into the red band (p > 0.05) are years where the null hypothesis of no
# difference cannot be rejected; bars that stay inside the green band (p < 0.05)
# are years where it is rejected.
fig = px.bar(results_df, x='Year', y='p-value',
             # title="Fisher's Test Comparison of Incidence in Age groups 35-64 years  vs 65+ years",
             text='p-value', text_auto=False)

# Each add_* call draws on the same figure.
fig.add_hline(y=0.05, line_width=3, line_dash="dash", line_color="red")
fig.add_hrect(y0=0.05, y1=1.0, line_width=0, fillcolor="red", opacity=0.1)
fig.add_hrect(y0=0.05, y1=0.0, line_width=0, fillcolor="green", opacity=0.2)

fig.update_layout(title_x=0.5, xaxis_title="Year (1975-2020)")

In [None]:
# This reformats the graph to be higher resolution.
config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'Fischers_pvalue_notitle_2020',
    'height': 500,
    'width': 1200,
    'scale':2.0 # Multiply title/legend/axis/canvas sizes by this factor
  }
}


fig.show(config=config)

ODDS RATIO GRAPH


In [None]:
# Scatter plot of the odds ratio from each year's Fisher's exact test, with a
# dashed red least-squares line showing the overall trend across calendar years.
# Given how the contingency table is laid out (65+ in the first column, 35-64 in
# the second), an odds ratio above 1 means higher odds of osteosarcoma in the 65+
# band than in the 35-64 band, an odds ratio of 1 means no difference, and a value
# below 1 means lower odds in the 65+ band.

# Convert from string to numeric values (anything unparseable becomes NaN).
results_df['Year'] = pd.to_numeric(results_df['Year'], errors='coerce')
results_df['Odds ratio'] = pd.to_numeric(results_df['Odds ratio'], errors='coerce')

fig = px.scatter(results_df, x='Year', y='Odds ratio',
                # title="Odds ratio from Fishers test across the years (between 35-64 & 65+)"
                 )

# Calculate regression line parameters (z = slope, p = intercept).
# Only finite points are used for the fit: fisher_exact can return an infinite or
# undefined odds ratio when a cell of the table is 0, and a single such value
# would make the least-squares fit fail or return NaN coefficients.
is_finite_point = np.isfinite(results_df['Year']) & np.isfinite(results_df['Odds ratio'])
fit_points = results_df[is_finite_point]
z, p = np.polyfit(fit_points['Year'], fit_points['Odds ratio'], 1)

# Add regression line as a Scatter trace, drawn over every year in results_df.
reg_trace = go.Scatter(x=results_df['Year'], y=z*results_df['Year']+p,
                       mode='lines', line=dict(width=3, dash="dash", color="red"))
reg_trace.name = "regression line"
fig.add_trace(reg_trace)

# Add shaded rectangle spanning the full plot width between y = 0.05 and y = 1.0.
# NOTE(review): these bounds match the p-value chart's "p > 0.05" band; on an
# odds-ratio axis they shade odds ratios from 0.05 to 1 -- confirm this is the
# intended region.
fig.add_shape(type="rect", xref="paper", yref="y", x0=0, x1=1, y0=0.05, y1=1.0,
              fillcolor="red", opacity=0.1, line_width=0)

# Set the x-axis tick labels: one tick per year, rotated 45 degrees.
labels = results_df['Year'].tolist()
fig.update_xaxes(tickvals=results_df['Year'], ticktext=labels, tickangle=45)

fig.update_layout(title_x = 0.5, xaxis_title= "Year (1975-2020)", )
# Place the legend inside the plot area, in the top-left corner.
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)
fig.show()

In [None]:
# Export options for plotly's image-download button, so the odds-ratio graph can
# be saved as a higher-resolution PNG, then display the figure again with those
# options.
config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='Fischers_oddsratio_notitle_2020',
        height=500,
        width=1200,
        scale=2.0,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

fig.show(config=config)