# **Data 200 - Final Project**


Authors: Aryan Jain, Micah Billington, Rupesh Rangwani, Devesh Talreja

Date : 8th April, 2024

## **Summary Of Contents**

### This Final Project contains the following sections:
1. Introduction
2. Data Retrieval & Cleaning
3. Data Preprocessing
4. Exploratory Data Analysis (EDA)
5. Model Application
6. Inference & Prediction
7. Conclusion

# 1. Introduction
<img src="Stack_Overflow.png" width="400" align="center">

- The full notebook and datasets can be found on GitHub: https://github.com/TrueCodee/Final-Project

# 2. Data Retrieval & Cleaning

### 2.1 Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
#!pip install statsmodels

### 2.2    Data Retrieval & Data Cleaning

In [None]:
# Load the raw survey responses.
df = pd.read_csv('survey_results_public.csv')

# The experience columns are mostly numeric strings, but the open-ended
# answers are stored as text. Map those to numbers *before* coercion;
# otherwise errors='coerce' silently turns them into NaN and the respondents
# are later treated as having unknown experience.
# NOTE(review): the two labels below are assumed from the public Stack
# Overflow survey schema -- confirm against the CSV for the year in use.
# If they are absent the mapping is a no-op.
special_year_values = {'Less than 1 year': 0, 'More than 50 years': 51}

for years_column in ('YearsCode', 'YearsCodePro'):
    cleaned = df[years_column].map(
        lambda value: special_year_values.get(value, value)
    )
    # Anything still non-numeric (including missing answers) becomes NaN.
    df[years_column] = pd.to_numeric(cleaned, errors='coerce', downcast='integer')

# Categorize 'YearsCodePro' into experience levels
def categorize_experience(years):
    """Map a number of years of coding to an experience-level label.

    Missing values (as detected by ``pd.isna``) give 'Unknown'. Otherwise the
    label is the first band whose inclusive upper bound is not exceeded:
    up to 2 -> 'Novice', up to 5 -> 'Intermediate', up to 10 -> 'Experienced',
    and anything above 10 -> 'Veteran'.
    """
    if pd.isna(years):
        return 'Unknown'

    # Bands are checked in ascending order; each upper bound is inclusive.
    bands = ((2, 'Novice'), (5, 'Intermediate'), (10, 'Experienced'))
    for upper_bound, label in bands:
        if years <= upper_bound:
            return label
    return 'Veteran'

# Label every respondent with an experience band based on professional years.
df['ExperienceLevel'] = df['YearsCodePro'].apply(categorize_experience)

# Create one boolean column per language of interest, named after the language.
languages_of_interest = ['Python', 'JavaScript', 'R', 'HTML/CSS', 'SQL', 'Java', 'C#', 'TypeScript', 'C', 'C++']
for language in languages_of_interest:
    # Match the language as a whole list entry rather than with \b word
    # boundaries. \b only exists between a word and a non-word character, so
    # r'C#\b' / r'C\+\+\b' can never match before ';' or end of string, while
    # r'\bC\b' also fires inside 'C#', 'C++' and 'Objective-C'.
    # NOTE(review): assumes 'LanguageHaveWorkedWith' is a ';'-separated list
    # -- confirm against the CSV.
    entry_pattern = r'(?i)(?:^|;)\s*' + re.escape(language) + r'\s*(?:;|$)'
    # na=False: respondents who left the question blank count as non-users.
    df[language] = df['LanguageHaveWorkedWith'].str.contains(entry_pattern, na=False)


Age Group Analysis: Calculate Language Usage by Age Group

In [None]:
# Share of respondents in each age group who report using each of the
# data-science-relevant languages (Python, R and SQL).
languages_of_interest_ds = ['Python','R','SQL']

# Averaging the per-language indicator columns within each 'Age' group gives
# the fraction of that group using the language.
age_language_usage = (
    df.groupby('Age')[languages_of_interest_ds]
    .mean()
    .reset_index()
)

# Reshape to long format (one row per age group / language pair) for plotting.
age_language_usage_long = age_language_usage.melt(
    id_vars=['Age'],
    value_vars=languages_of_interest_ds,
    var_name='Language',
    value_name='Usage',
)

Visualize Language Usage by Age Group

In [None]:
# Grouped bar chart: share of each age group using each language.
from matplotlib.ticker import PercentFormatter

sns.set(style="whitegrid", palette="pastel")
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Create the barplot (values are already aggregated, so no error bars)
chart = sns.barplot(
    x='Age',
    y='Usage',
    hue='Language',
    data=age_language_usage_long,
    errorbar=None
)

# Build the title from the languages actually plotted so it cannot drift out
# of sync with the data (the chart previously claimed "Python and SQL" only).
plotted_languages = ', '.join(age_language_usage_long['Language'].unique())
chart.set_title(f'Usage of {plotted_languages} by Age Group', fontsize=16)
chart.set_ylabel('Percentage of Respondents', fontsize=12)
chart.set_xlabel('Age Group', fontsize=12)
# 'Usage' holds fractions in [0, 1]; show the ticks as percentages so the
# axis agrees with its label and with the bar annotations.
chart.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.legend(title='Programming Language', fontsize=10)

# Adding data labels on top of the bars
for p in chart.patches:
    # Get the height of the bar
    height = p.get_height()
    # Skip zero-height (and NaN) patches so no empty label is drawn
    if height > 0:
        chart.annotate(f'{height:.1%}',
                       (p.get_x() + p.get_width() / 2., height),
                       ha = 'center', va = 'center',
                       xytext = (0, 9),
                       textcoords = 'offset points', fontsize=9)

plt.tight_layout()
plt.show()