In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
import matplotlib.ticker as mticker

In [5]:
# Loading and preprocessing data
#
# NOTE(review): the previous call passed an absolute Windows path in a normal
# string literal (the "\U" sequence is an invalid escape, so the cell could not
# even be parsed) and pointed at the notebook file rather than a CSV. The file
# name below is the one shown in this cell's last recorded error output --
# TODO confirm it matches the dataset file and that it sits next to the notebook.
DATA_PATH = 'lung-cancer-dataset.csv'

# Encodings for the two text columns.
GENDER_CODES = {'M': 0, 'F': 1}
LUNG_CANCER_CODES = {'YES': 1, 'NO': 0}

# Read, drop incomplete rows and encode in one chain so that `df` is always
# rebuilt from the raw file (re-running the cell gives the same frame).
df = (
    pd.read_csv(DATA_PATH)
    .dropna()
    .assign(
        GENDER=lambda frame: frame['GENDER'].map(GENDER_CODES),
        LUNG_CANCER=lambda frame: frame['LUNG_CANCER'].map(LUNG_CANCER_CODES),
    )
)

# Series.map turns any value missing from the dict into NaN; fail loudly here
# instead of letting NaN reach the model.
unmapped = df[['GENDER', 'LUNG_CANCER']].isna().any()
if unmapped.any():
    raise ValueError(
        "Unexpected category values in: " + ", ".join(unmapped[unmapped].index)
    )

# Separating features and target variable
X = df.drop(columns=['LUNG_CANCER'])

# BernoulliNB binarizes every feature with a default threshold of 0.0, so a
# column coded 1/2 would become a constant 1 and carry no information.
# Re-code such columns to 0/1 in the feature matrix only; `df` keeps the
# original 1/2 coding that the plotting cells below rely on.
# NOTE(review): AGE is positive for every row and is therefore also binarized
# to a constant by BernoulliNB -- consider bucketing it or using another model.
one_two_cols = [col for col in X.columns if X[col].isin([1, 2]).all()]
if one_two_cols:
    X[one_two_cols] = X[one_two_cols] - 1

y = df['LUNG_CANCER']   # Target, already encoded as 1 (YES) / 0 (NO) above

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

FileNotFoundError: [Errno 2] No such file or directory: 'lung-cancer-dataset.csv'

In [None]:
# Train a Bernoulli Naive Bayes classifier on the training split
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

In [None]:
# Evaluating the model on the held-out test split
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# The report is printed with its own print() call: passing it as a second
# positional argument makes print() insert a separator space right after the
# newline, which shifts the first line of the table out of alignment.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Cross-validation: 5-fold accuracy over the full feature matrix
scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-Validated Accuracy: {np.mean(scores)}")

In [None]:
# Handle outliers through IQR
#
# Only continuous numeric columns are screened. Binary / coded columns are
# skipped on purpose: when one value dominates such a column its first and
# third quartiles coincide (IQR == 0), so the rule would flag every row holding
# the rarer value as an "outlier" and silently drop a whole class.
# Non-numeric columns are skipped because quantiles are undefined for them.
numeric_cols = df.select_dtypes(include='number').columns
continuous_cols = [col for col in numeric_cols if df[col].nunique() > 2]

if continuous_cols:
    screened = df[continuous_cols]

    Q1 = screened.quantile(0.25)    # first quartile (25th percentile) of each screened column
    Q3 = screened.quantile(0.75)    # third quartile (75th percentile) of each screened column
    IQR = Q3 - Q1                   # interquartile range: spread of the middle 50% of the data

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # A row is an outlier if any screened value lies below the lower bound or
    # above the upper bound.
    is_outlier = (screened.lt(lower_bound) | screened.gt(upper_bound)).any(axis=1)

    # Keep the rows without outliers; .copy() gives later cells an independent
    # frame they can add columns to without chained-assignment warnings.
    df = df[~is_outlier].copy()

In [None]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

In [None]:
# prompt: Using the DataFrame df: a bar graph relating AGE and GENDER; AGE must be split into groups with an interval of 5
# (pandas and matplotlib.pyplot are already imported in the first cell)

# GENDER codes as assigned in the preprocessing cell (M -> 0, F -> 1)
gender_labels = {0: 'Male', 1: 'Female'}

# Create age groups with an interval of 5.
# NOTE: these bins cover (35, 85]; ages outside that range get NaN from pd.cut
# and are therefore left out of the chart.
df['Age_Group'] = pd.cut(df['AGE'], range(35, 90, 5))

# Group data by age group and gender and count the number of people in each group.
# observed=False is spelled out so that empty age bins still appear on the axis.
age_gender_counts = (
    df.groupby(['Age_Group', 'GENDER'], observed=False)['GENDER']
    .count()
    .unstack()
)

# Create a bar plot
ax = age_gender_counts.plot(kind='bar', figsize=(12, 6))
ax.set_title('Correlation between Age and Gender')
ax.set_xlabel('Age Group')
ax.set_ylabel('Number of People')
# Label each series from its actual column code: the unstacked columns are
# ordered 0, 1 (Male, Female), so a hard-coded ['Female', 'Male'] list would
# swap the two legend entries.
ax.legend([gender_labels.get(code, code) for code in age_gender_counts.columns])
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()

In [None]:
# Percentage of YES / NO answers for every coded attribute.

# Exclude GENDER, AGE and LUNG_CANCER, plus the derived Age_Group column (added
# by the age/gender chart cell), which is not a 1/2-coded attribute and would
# otherwise show up as an empty pair of bars.
excluded_columns = {'GENDER', 'AGE', 'LUNG_CANCER', 'Age_Group'}
attributes = [col for col in df.columns if col not in excluded_columns]

# Share of rows coded 2 ('YES') and 1 ('NO') per attribute, computed for all
# attributes at once: the mean of a boolean column is the fraction of True rows.
yes_percentages = df[attributes].eq(2).mean() * 100
no_percentages = df[attributes].eq(1).mean() * 100

# Each attribute occupies two x slots: YES at 2*i, NO at 2*i + 1
yes_positions = np.arange(len(attributes)) * 2
no_positions = yes_positions + 1

# Create a figure and axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(yes_positions, yes_percentages.to_numpy(), color='blue', label='YES')
ax.bar(no_positions, no_percentages.to_numpy(), color='orange', label='NO')

ax.set_xlabel('Attributes')
ax.set_ylabel('Percentage of Instances')
ax.set_title('Attribute Distribution')

# Set X-axis ticks and labels, centred between the YES and NO bar of each pair
ax.set_xticks(yes_positions + 0.5)
ax.set_xticklabels(attributes, rotation=90)

# Add legend
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# @title Correlation Matrix of Lung Cancer Symptoms

import matplotlib.pyplot as plt
import seaborn as sns

# Columns of `df` treated as symptoms. 'FATIGUE ' is written with its trailing
# space on purpose: the name is used verbatim as the column key.
symptom_columns = ['COUGHING', 'SHORTNESS OF BREATH', 'CHEST PAIN', 'WHEEZING', 'FATIGUE ']
symptoms = df[symptom_columns]

# Pairwise correlation between the selected symptom columns
correlation_matrix = symptoms.corr()

# Annotated heatmap, values shown with two decimals
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
_ = plt.title('Correlation Matrix of Lung Cancer Symptoms')

In [None]:
# @title Age Distribution by Lung Cancer Diagnosis
# (matplotlib.pyplot is already imported in the first cell)

# Ages split by diagnosis; LUNG_CANCER is encoded as 1 (YES) / 0 (NO) in the
# preprocessing cell.
age_lung_cancer_yes = df.loc[df['LUNG_CANCER'] == 1, 'AGE']
age_lung_cancer_no = df.loc[df['LUNG_CANCER'] == 0, 'AGE']

fig, ax = plt.subplots()
ax.hist(
    [age_lung_cancer_yes, age_lung_cancer_no],
    bins=10,
    label=['Lung Cancer', 'No Lung Cancer'],
    # 'light red' is not a colour name matplotlib recognises and makes hist()
    # raise ValueError; 'lightcoral' is a valid named colour with the same intent.
    color=['lightcoral', 'blue'],
)
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Age Distribution by Lung Cancer Diagnosis')
_ = ax.legend()