In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visual style for plots: apply seaborn's "whitegrid" style once, up front,
# so the figures drawn in the cells below share a consistent look.
sns.set(style="whitegrid")

In [None]:
# Read the tuberculosis dataset from disk into a DataFrame.
# NOTE: 'tuberculosis.csv' is a placeholder; substitute the real path to your file.
data = pd.read_csv(filepath_or_buffer='tuberculosis.csv')

# Preview the leading rows to confirm the file parsed as expected
print(data.head(n=5))

In [None]:
# Data Cleaning
# Count the null entries in every column before any rows are removed
null_counts = data.isna().sum()
print("Missing values in each column:")
print(null_counts)

In [None]:
# Keep only rows where 'Numeric', 'Low' and 'High' are all present;
# a row missing any one of the three is discarded.
data = data.dropna(axis='index', how='any', subset=['Numeric', 'Low', 'High'])

In [None]:
# Re-count nulls per column to verify the effect of the row filter above
remaining_nulls = data.isna().sum()
print("Missing values after cleaning:")
print(remaining_nulls)

In [None]:
# Parse the '%Y'-formatted values of 'YEAR (DISPLAY)' into timestamps and store
# them in a separate 'YEAR' column for time series analysis.
# The source column 'YEAR (DISPLAY)' itself is left unchanged.
year_stamps = pd.to_datetime(data['YEAR (DISPLAY)'], format='%Y')
data['YEAR'] = year_stamps

In [None]:
# Exploratory Data Analysis (EDA)
# Summary statistics of the 'Numeric' column
incidence_summary = data['Numeric'].describe()
print("Descriptive statistics for TB incidence:")
print(incidence_summary)

In [None]:
# Collapse the records to one row per 'YEAR (DISPLAY)' value:
#   average_incidence / total_cases -> mean and sum of 'Numeric'
#   low_value / high_value          -> mean of the 'Low' and 'High' columns
per_year = data.groupby('YEAR (DISPLAY)')
yearly_data = per_year.agg(**{
    'average_incidence': pd.NamedAgg(column='Numeric', aggfunc='mean'),
    'total_cases': pd.NamedAgg(column='Numeric', aggfunc='sum'),
    'low_value': pd.NamedAgg(column='Low', aggfunc='mean'),
    'high_value': pd.NamedAgg(column='High', aggfunc='mean'),
})
# Turn the group key back into an ordinary column so it can be used as a plot axis
yearly_data = yearly_data.reset_index()

In [None]:
# Year-over-year change of the yearly mean, expressed in percent.
# The first row has no predecessor, so its value is NaN.
yearly_data['pct_change'] = yearly_data['average_incidence'].pct_change().mul(100)

# Visualization 1: Line plot of TB incidence over time
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=yearly_data,
    x='YEAR (DISPLAY)',
    y='average_incidence',
    marker='o',
    ax=ax,
)
ax.set_title('Average TB Incidence Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average TB Incidence')
plt.xticks(rotation=45)  # slant the year labels so they do not overlap
plt.tight_layout()
plt.show()

In [None]:
# Visualization 2: Bar Chart of TB Incidence by Year
# One bar per year, height = yearly mean of 'Numeric', coloured from the 'viridis' palette.
# NOTE(review): seaborn >= 0.13 reportedly warns when `palette` is given without `hue`;
# confirm the installed version before switching to the `hue=` / `legend=False` form.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=yearly_data,
    x='YEAR (DISPLAY)',
    y='average_incidence',
    palette='viridis',
    ax=ax,
)
ax.set_title('Average TB Incidence in Kenya by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average TB Incidence')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Visualization 3: Histogram of TB Incidence Values
# 20-bin histogram of the row-level 'Numeric' values with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=data, x='Numeric', bins=20, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of TB Incidence Values')
ax.set_xlabel('TB Incidence')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Visualization 4: Box Plot of TB Incidence Values by Year
# One box per year, summarising the spread of the row-level 'Numeric' values.
# NOTE(review): seaborn >= 0.13 reportedly warns when `palette` is given without `hue`;
# confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=data,
    x='YEAR (DISPLAY)',
    y='Numeric',
    palette='Set2',
    ax=ax,
)
ax.set_title('Distribution of TB Incidence Values in Kenya by Year')
ax.set_xlabel('Year')
ax.set_ylabel('TB Incidence')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Additional Analysis: Error Bars using Low and High values if available
has_bounds = {'Low', 'High'}.issubset(data.columns)
if not has_bounds:
    print("Low and High values are not available for error bars.")
else:
    # Per-year mean of the estimate and of its lower/upper bounds.
    # NOTE(review): these are averaged 'Low'/'High' columns; whether they are
    # confidence intervals depends on the dataset, so confirm the plot title.
    ci_data = (
        data.groupby('YEAR (DISPLAY)')
        .agg(
            mean_value=('Numeric', 'mean'),
            low_value=('Low', 'mean'),
            high_value=('High', 'mean'),
        )
        .reset_index()
    )

    # Asymmetric bar lengths: distance from the mean down to 'low' and up to 'high'.
    # NOTE(review): assumes Low <= Numeric <= High so both distances are
    # non-negative; verify against the data.
    lower_err = ci_data['mean_value'] - ci_data['low_value']
    upper_err = ci_data['high_value'] - ci_data['mean_value']

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.errorbar(
        ci_data['YEAR (DISPLAY)'],
        ci_data['mean_value'],
        yerr=[lower_err, upper_err],
        fmt='o',
        capsize=5,
    )
    ax.set_title('TB Incidence Over Time with Confidence Intervals')
    ax.set_xlabel('Year')
    ax.set_ylabel('TB Incidence')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Correlation Analysis between Numeric Value, Low, and High
# Pairwise Pearson correlation of the three value columns
value_columns = ['Numeric', 'Low', 'High']
correlation_matrix = data[value_columns].corr(method='pearson')
print("Correlation matrix:")
print(correlation_matrix)

In [None]:
# Visualization 5: Heatmap of Correlation Matrix
# Each cell is annotated with its coefficient, rounded to two decimals
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Heatmap between TB Incidence Metrics')
plt.tight_layout()
plt.show()

In [None]:
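# Visualization 6: Pie Chart for Proportion of TB Cases by Region (if applicable)
# NOTE: this cell is currently disabled (commented out). To enable it, remove the
# leading "# " from each code line below.
# NOTE(review): value_counts() counts rows per region, not cases, so the title
# 'Proportion of TB Cases ...' may overstate what the chart shows; confirm before enabling.
# if 'REGION (DISPLAY)' in data.columns:
#     region_data = data['REGION (DISPLAY)'].value_counts()
#
#     plt.figure(figsize=(8, 8))
#     plt.pie(region_data, labels=region_data.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
#     plt.title('Proportion of TB Cases by Region in Kenya')
#     plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
#     plt.tight_layout()
#     plt.show()
# else:
#     print("No regional data available for pie chart visualization.")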