In [1]:
import re
import os
import time
import requests
import pandas as pd
from zipfile import ZipFile
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# Configuration
MAX_RETRIES = 10
RETRY_DELAY = 30  # seconds
SURVEY_YEARS = list(range(2011, 2026))  # 2011 to 2025

print("Starting Stack Overflow Survey Data Download...")
print(f"Years to download: {min(SURVEY_YEARS)} to {max(SURVEY_YEARS)}")
print(f"Retry configuration: {MAX_RETRIES} max attempts, {RETRY_DELAY}s delay")
print(f"URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{{year}}.zip\n")


Starting Stack Overflow Survey Data Download...
Years to download: 2011 to 2025
Retry configuration: 10 max attempts, 30s delay
URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip



In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
def get_survey_urls(year):
    """
    Generate URL for a given survey year.
    All years use the same datasets ZIP pattern.
    """
    # All years use the same URL pattern
    url = f"https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip"
    return [url]

def fix_headers_for_older_years(df, year):
    """
    Fix headers for years 2011-2016 where the first two rows are headers.
    If the second row says "Response", use only the first row value.
    Otherwise, combine the first two rows.
    """
    if year >= 2011 and year <= 2015:
        if df.shape[0] < 2:
            print(f"  Warning: Not enough rows to fix headers for year {year}")
            return df
        
        # Get the first two rows
        first_row = df.iloc[0].astype(str)
        second_row = df.iloc[1].astype(str)
        
        # Create new column names
        new_columns = []
        for i, (first_val, second_val) in enumerate(zip(first_row, second_row)):
            first_val = first_val.strip()
            second_val = second_val.strip()
            
            # If second row is "Response", just use first row value
            if second_val == "Response":
                new_columns.append(first_val)
            else:
                # If both are the same or second is empty, use first
                if first_val == second_val or second_val == "":
                    new_columns.append(first_val)
                else:
                    # Combine both values - first value as primary
                    new_columns.append(f"{first_val} ({second_val})")
        
        # Set new column names
        df.columns = new_columns
        
        # Drop the first two rows (header rows)
        df = df.iloc[2:].reset_index(drop=True)
        
        print(f"  ✓ Fixed headers for year {year} (combined first two rows, removed 2 header rows)")
    
    return df

def download_file(url, year):
    """
    Download a file (single attempt, no retries).
    Returns the content if successful, None otherwise.
    Handles both CSV and ZIP files.
    """
    try:
        print(f"  Trying URL: {url}")
        response = requests.get(url, timeout=60, stream=True)
        response.raise_for_status()
        
        # Check content type
        content_type = response.headers.get('content-type', '').lower()
        
        # Reject HTML responses (likely error pages)
        if 'html' in content_type and response.status_code == 200:
            # Might be an error page, try next URL pattern
            print(f"  Warning: Received HTML instead of data file, may be wrong URL")
            return None
        
        content = response.content
        # Basic validation: check if content looks reasonable
        if len(content) < 100:
            print(f"  Warning: File too small, may be error page")
            return None
        
        # Check if it's a ZIP file by magic bytes
        is_zip = content[:2] == b'PK'  # ZIP files start with PK
        if is_zip:
            print(f"  ✓ Successfully downloaded {year} as ZIP ({len(content):,} bytes)")
        else:
            print(f"  ✓ Successfully downloaded {year} ({len(content):,} bytes)")
        
        return content
        
    except requests.exceptions.RequestException as e:
        print(f"  ✗ Download error: {str(e)}")
        return None


In [4]:
def validate_url(url):
    """
    Validate if a URL exists without downloading the full content.
    Returns True if the URL is valid and returns proper headers.
    """
    try:
        # Only get headers to check existence
        response = requests.head(url, timeout=10)
        return response.status_code == 200 and 'content-length' in response.headers
    except requests.exceptions.RequestException:
        return False

In [5]:
def download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY, sample_size=None):
    """
    Download and extract survey data for a given year with retry logic.
    Tries multiple URL patterns and handles both CSV and ZIP files.
    Wraps the entire process in retry logic to catch any runtime errors.
    
    Args:
        year: The survey year to download
        max_retries: Maximum number of retry attempts
        delay: Delay between retries in seconds
        sample_size: If provided, only read this many rows from the CSV (for testing/development)
    
    Returns:
        DataFrame if successful, None otherwise.
    """
    print(f"\n{'='*60}")
    print(f"Processing year {year}")
    print(f"{'='*60}")
    
    urls = get_survey_urls(year)
    
    # Outer retry loop for entire download/extract process
    # This will retry the entire process up to max_retries times if a RuntimeError occurs
    for retry_attempt in range(max_retries):
        try:
            # Try each URL pattern
            for url in urls:
                content = download_file(url, year)
                
                if content is None:
                    continue
                
                # Check if content is a ZIP file by magic bytes (ZIP files start with 'PK')
                is_zip = content[:2] == b'PK'
                
                if is_zip:
                    # Try to parse as ZIP
                    try:
                        with ZipFile(BytesIO(content)) as zip_file:
                            # Look for CSV files in the ZIP (exclude macOS metadata)
                            csv_files = [f for f in zip_file.namelist() 
                                       if f.endswith('.csv') and not f.startswith('__MACOSX/')]
                            if csv_files:
                                # Use the first CSV file found
                                csv_file = csv_files[0]
                                print(f"  Found CSV file in ZIP: {csv_file}")
                                with zip_file.open(csv_file) as f:
                                    # For years 2011-2016, read without header to fix manually
                                    read_kwargs = {
                                        'low_memory': False, 
                                        'on_bad_lines': 'skip',
                                        'nrows': sample_size  # Add sample size parameter
                                    }
                                    if year >= 2011 and year <= 2015:
                                        read_kwargs['header'] = None
                                    
                                    try:
                                        df = pd.read_csv(f, encoding='utf-8', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                                    except UnicodeDecodeError:
                                        f.seek(0)
                                        df = pd.read_csv(f, encoding='latin-1', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                            else:
                                print(f"  No CSV files found in ZIP archive")
                    except Exception as e:
                        print(f"  ZIP parsing failed: {str(e)}")
                        continue
                else:
                    # Try to parse as CSV directly
                    # For years 2011-2016, read without header to fix manually
                    read_kwargs = {
                        'low_memory': False, 
                        'on_bad_lines': 'skip',
                        'nrows': sample_size  # Add sample size parameter
                    }
                    if year >= 2011 and year <= 2016:
                        read_kwargs['header'] = None
                    
                    try:
                        df = pd.read_csv(BytesIO(content), encoding='utf-8', **read_kwargs)
                        print(f"  ✓ Successfully loaded {year} as CSV ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                        # Fix headers for older years
                        df = fix_headers_for_older_years(df, year)
                        return df
                    except UnicodeDecodeError:
                        # Try different encoding
                        try:
                            df = pd.read_csv(BytesIO(content), encoding='latin-1', **read_kwargs)
                            print(f"  ✓ Successfully loaded {year} as CSV with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                            # Fix headers for older years
                            df = fix_headers_for_older_years(df, year)
                            return df
                        except Exception as e:
                            print(f"  CSV parsing failed: {str(e)}")
                            continue
                    except Exception as e:
                        print(f"  CSV parsing failed: {str(e)}")
                        continue
            
            # If we get here, the URL failed - this triggers a retry if attempts remain
            if retry_attempt < max_retries - 1:
                print(f"  ✗ Download failed for year {year}")
                print(f"  Retrying entire process (attempt {retry_attempt + 2}/{max_retries}) in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed to download and extract data for year {year} after {max_retries} attempts")
                print(f"  URL attempted: {urls[0]}")
                return None
                
        except RuntimeError as e:
            print(f"  ✗ Runtime error on attempt {retry_attempt + 1}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts due to runtime error")
                return None
        except Exception as e:
            # Catch any other unexpected errors and retry
            print(f"  ✗ Unexpected error on attempt {retry_attempt + 1}: {type(e).__name__}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts")
                return None
    
    return None

In [6]:
# Define sample size for testing (set to None for full dataset)
SAMPLE_SIZE = 1000  # Adjust this value to control how many rows to read from each year

# Download and create dataframes for each year
dataframes = {}

for year in SURVEY_YEARS:
    df = download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY)
    if df is not None:
        # Add a year column to track which year the data is from
        df['SurveyYear'] = year
        dataframes[year] = df
    else:
        print(f"⚠ Skipping year {year} - download failed")

print(f"\n{'='*60}")
print(f"Download Summary")
print(f"{'='*60}")
print(f"Successfully downloaded: {len(dataframes)} out of {len(SURVEY_YEARS)} years")
print(f"Years downloaded: {sorted(dataframes.keys())}")
print(f"Years failed: {[y for y in SURVEY_YEARS if y not in dataframes]}")

# Display info for each dataframe
if dataframes:
    print(f"\n{'='*60}")
    print(f"DataFrame Information")
    print(f"{'='*60}")
    for year, df in sorted(dataframes.items()):
        print(f"Year {year}: {df.shape[0]:,} rows × {df.shape[1]:,} columns")


Processing year 2011
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2011.zip
  ✓ Successfully downloaded 2011 as ZIP (80,173 bytes)
  Found CSV file in ZIP: 2011 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2011 from ZIP with latin-1 encoding (2,815 rows, 65 cols)
  ✓ Fixed headers for year 2011 (combined first two rows, removed 2 header rows)

Processing year 2012
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2012.zip
  ✓ Successfully downloaded 2012 as ZIP (266,621 bytes)
  Found CSV file in ZIP: 2012 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2012 from ZIP with latin-1 encoding (6,245 rows, 75 cols)
  ✓ Fixed headers for year 2012 (combined first two rows, removed 2 header rows)

Processing year 2013
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2013.zip
  ✓ Successfully downloaded 2013 as ZIP (689,493 bytes)
  Found CSV file in ZIP: 

### Create Combined Data Frame

In [7]:
# Create combined dataframe from all years
if dataframes:
    print(f"\n{'='*60}")
    print(f"Creating Combined DataFrame")
    print(f"{'='*60}")
    
    # Ensure all dataframes have unique columns before concatenation
    # Find the union of all columns
    all_columns = set()
    for df in dataframes.values():
        all_columns.update(df.columns)
    all_columns = list(all_columns)

    # Reindex each dataframe to ensure unique columns for concat
    aligned_dfs = []
    for year, df in dataframes.items():
        # Remove duplicate columns if any (can happen on bad CSVs)
        df = df.loc[:,~df.columns.duplicated()]
        aligned_df = df.reindex(columns=all_columns)
        aligned_dfs.append(aligned_df)
    
    combined_df = pd.concat(aligned_dfs, ignore_index=True, sort=False)
    
    print(f"success!")
    print(f"  rows: {combined_df.shape[0]:,}")
    print(f"  columns: {combined_df.shape[1]:,}")
    print(f"  Years: {sorted(combined_df['SurveyYear'].dropna().unique())}")
    
    # Show basic info about the combined dataframe
    print(f"\n{'='*60}")
    print(f"Combined DataFrame Info")
    print(f"{'='*60}")
    print(combined_df.info())
    print(f"{'='*60}")
else:
    print("\n⚠ No dataframes were successfully downloaded. Cannot create combined dataframe.")
    combined_df = None



Creating Combined DataFrame
success!
  rows: 772,599
  columns: 1,087
  Years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772599 entries, 0 to 772598
Columns: 1087 entries, nan (>$150,000) to StackOverflowAdsDistracting
dtypes: float64(106), int64(1), object(980)
memory usage: 6.3+ GB
None


In [8]:
# Access individual year dataframes: dataframes[year]
# Access combined dataframe: combined_df
# Example:
if dataframes:
    print(f"\n{'='*60}")
    print(f"How to Access Your Data")
    print(f"{'='*60}")
    print(f"Individual year dataframes:")
    print(f"  - dataframes[2024]  # Access 2024 data")
    print(f"  - dataframes[2023]  # Access 2023 data")
    print(f"  - etc.")
    print(f"\nCombined dataframe:")
    print(f"  - combined_df  # All years combined")
    print(f"\nAvailable years: {sorted(dataframes.keys())}")
    
    # Quick preview of the combined dataframe
    if combined_df is not None:
        print(f"\n{'='*60}")
        print(f"Combined DataFrame Preview (first 5 rows)")
        print(f"{'='*60}")
        print(combined_df.head())



How to Access Your Data
Individual year dataframes:
  - dataframes[2024]  # Access 2024 data
  - dataframes[2023]  # Access 2023 data
  - etc.

Combined dataframe:
  - combined_df  # All years combined

Available years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Preview (first 5 rows)
  nan (>$150,000)  ExpectedSalary AIToolInterested in Using TrueFalse_1  \
0             NaN             NaN                       NaN         NaN   
1             NaN             NaN                       NaN         NaN   
2             NaN             NaN                       NaN         NaN   
3             NaN             NaN                       NaN         NaN   
4       >$150,000             NaN                       NaN         NaN   

  SOTagsHaveWorkedWith ToolsTechWantToWorkWith OfficeStackSyncAdmired  \
0                  NaN                     NaN                    NaN   
1                  NaN                     NaN  

In [9]:
combined_df.head()

Unnamed: 0.1,"nan (>$150,000)",ExpectedSalary,AIToolInterested in Using,TrueFalse_1,SOTagsHaveWorkedWith,ToolsTechWantToWorkWith,OfficeStackSyncAdmired,NEWCollabToolsDesireNextYear,Knowledge_7,AINextNeither different nor similar,nan (Other tablet),nan (Training & Education: Masters in CS),nan (Most important aspect of new job opportunity: Office location),SexualOrientation,AdsPriorities5,WebframeChoice,ImpSyn,EmbeddedHaveWorkedWith,DatabaseChoice,agree_legacy,OpSys,OfficeStackSyncWantToWorkWith,nan (Training & Education: Mentorship),nan (Excitement About the Company's Products),nan (Go),nan (Current Lang & Tech: R),Do you work remotely?,nan (Why use Stack Overflow: Can't do job without it),BuyNewTool,StackOverflowJobsRecommend,LearnCodeAI,StackOverflowMetaChat,CheckInCode,AdsPriorities1,InfluenceWorkstation,AssessJobOffice,NEWJobHunt,Gender,JobEmailPriorities4,nan (Remote Status),WebframeWantEntry,nan (Future Lang & Tech: PHP),Which languages are you proficient in? (Java),AnnoyingUI,StackOverflowJobListing,AIDangerous,commit_frequency,agree_loveboss,job_satisfaction,StackOverflowBetter,nan (WinRT),Do you have a Stack Overflow Careers 2.0 Profile?,WantWorkPlatform,EquipmentSatisfiedRW,nan (Current Lang & Tech: Python),EmploymentStatus,WantWorkLanguage,AssessBenefits7,WelcomeChange,DifficultCommunication,TechList,BuildvsBuy,TechOppose_5,rep_range,nan (Redis),AINextNo change,ExCoderSkills,nan (Why answer: Help future programmers),Which desktop operating system do you use the most?,AssessBenefits3,nan (Current Lang & Tech: Java),LearnCodeOnline,Frequency_2,ImportantBenefits,open_to_new_job,salary_range,nan (Why use Stack Overflow: Maintain online presence),InTheZone,why_learn_new_tech,CareerSatisfaction,Employment,NewRole,InfluenceRecruitment,remote,nan (Looking for a job),AssessJob9,What Country do you live in?,nan (Most important aspect of new job opportunity: Company reputation),nan (Why try Stack Overflow Careers: Jobs are on Stack Overflow),nan (Employment Status),WebframeAdmired,StackOverflowSatisfaction,WorkExp,nan (TypeScript),nan (Future Lang & Tech: C#),"nan (Link to a Stack Overflow Careers Company Page or other source of more information about the company (videos, articles, etc))",nan (Who do you want to communicate with about a new job opportunity: Manager),StackOverflowJobs,LastInt,NEWDevOpsImpt,Frequency_3,SOTagsHaveEntry,AIChallenges,AIToolCurrently mostly AI,AssessJobProjects,AIModelsAdmired,LanguageAdmired,JobSatPoints_15_TEXT,nan (Current Lang & Tech: Haskell),PlatformHaveEntry,nan (Source control used: CVS),Student,JobSatPoints_14,nan (Why use Stack Overflow: Communicate with others),nan (Current Lang & Tech: CoffeeScript),nan (Future Lang & Tech: Go),AIAgentKnowWrite,nan (C),How likely is it that a recommendation you make will be acted upon?,EthicsResponsible,nan (Netbook),"nan ($41,000 - $75,000)","If your company has a native mobile app, what platforms do you support? (iPhone)",AIHuman,How would you best describe the industry you currently work in?,nan (Office in a Desirable City/Area),InfluenceCommunication,nan (Android),important_sameend,StackOverflowAnswer,nan (Who do you want to communicate with about a new job opportunity: Developer),LearnCodeChoose,important_promotion,employment_status,Select all that apply (Current Lang & Tech: Android),AssessJob4,AIAgentOrchWrite,agree_alcohol,What advertisers do you remember seeing on Stack Overflow? (Open-Ended Response),"You answered you don't have a Careers profile, can you elaborate why?",nan (Hadoop),dev_environment,dogs_vs_cats,OfficeStackWantEntry,"nan (Training & Education: Some college, but no CS degree)",Select all that apply (Why try Stack Overflow Careers: No spam),SO_Actions_7,StackOverflowHelpful,ProblemSolving,nan (Current Lang & Tech: Rust),nan (Source control used: TFS),DevEnvsAdmired,AISearchDevAdmired,AIAgentObsWrite,AIAgentImpactNeutral,"In an average week, how do you spend your time? (Developing new features)",nan (Ask questions to solve problems),programming_ability,LearnedHiring,Q120,SOJobs,nan (C++),SOFriction,nan (Perl),FriendsDevelopers,nan (Meetings),nan (AngularJS),nan (Finance),University,StackOverflowAdsRelevant,BuildingThings,nan (Future Lang & Tech: Haskell),StackOverflowModeration,nan (Industry),SOVisitFreq,Ethnicity,Onboarding,AdsPriorities3,nan (Future Lang & Tech: Matlab),WakeTime,LearnCodeCoursesCert,nan (Learning new skills),Select up to 3 (How can companies improve interview process: More live code),SurveyLength,LanguagesWantEntry,AssessJobCommute,nan (Appealing message traits: Salary information),StackOverflowHasAccount,JobContactPriorities1,nan (Future Lang & Tech: SQL),nan (Other Smart Phone),nan (Current Lang & Tech: iOS),JobEmailPriorities1,nan (Most annoying about job search: The Interview),CollaborateRemote,nan (Technical Support),AssessJobTech,HaveWorkedLanguage,WorkRemote,important_wfh,nan (Opportunity to Use/Learn New Technologies),nan (Future Lang & Tech: Redis),AIToolCurrently partially AI,StackOverflowDevices,nan (C#),"If you make a software product, how does your company make money? (You can choose more than one) (Advertising)",PurchaseWhat,What types of purchases are you involved in? (Hardware),How would you best describe the industry you work in?,UnderstandComputers,TrueFalse_3,DatabaseHaveWorkedWith,InfluenceTechStack,AIInteresting,nan (Source control used: Legacy / Custom),tech_want,AssessJob2,DevEnvWantEntry,MajorUndergrad,What type of project are you developing?,NonDeveloperType,nan (Current Lang & Tech: C++11),Salary,NEWOtherComms,BetterLife,SO_Actions_10,MiscTechAdmired,InterestedAnswers,AdsPriorities4,ClickyKeys,ProgramHobby,nan (Most important aspect of new job opportunity: Equity),JobSatPoints_7,AdsPriorities6,NEWSOSites,AIExplain,nan (Software),nan (Current Lang & Tech: MongoDB),nan (HTML5),OtherPeoplesCode,AdBlocker,JobSatPoints_10,Knowledge_5,nan (Cordova),EthicsReport,nan (Android phone),nan (Source control used: SVN),BlockchainIs,nan (How often are Stack Overflow's answers helpful),Where do you work remotely most of the time?,ImportantHiringCompanies,"nan ($100,001 - $150,000)",nan (Using Stack Exchange),BoringDetails,AdBlockerDisable,AdBlockerReasons,JobSatisfaction,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (I can recommend or request products),nan (How can companies improve interview process: Remote interviews),nan (Recruitment Tools & Services),AssessJob10,JobSecurity,AIResponsible,StackOverflowCommunity,nan (Training & Education: PhD in CS),nan (Why use Stack Overflow: Demonstrate expertise),CommPlatformWantToWorkWith,nan (Most important aspect of new job opportunity: Building something that matters),TabsSpaces,nan (Current Lang & Tech: JavaScript),nan (Kindle),OffOn,nan (Most annoying about job search: Finding interesting job),YearsCodedJob,SeriousWork,nan (iPad),SO_Actions_6,Which best describes the size of your company?,nan (Node.js),nan (Most urgent info about job opportunity: Colleagues),StackOverflowMakeMoney,MilitaryUS,nan (Occupation),visit_frequency,Please rate your job/career satisfaction,ExCoder10Years,ProjectManagement,JobSatPoints_11,AIAgentImpactStrongly agree,LastHireDate,nan (The ads are Informative),Have you visited / Are you aware of Stack Overflow Careers 2.0?,AIAgentChallengesStrongly agree,nan (Current Lang & Tech: Redis),"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?","nan (Positive Organization Structure (not much bureaucracy, helpful management))",OpenSource,How do you prefer to be contacted about job opportunities? (Email),QuestionsInteresting,nan (Java),SalaryType,NEWLearn,TechOppose_9,AssessJobCompensation,DatabaseDesireNextYear,HoursComputer,ConvertedCompYearly,nan (Twitter),AIAgentKnowledge,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",Have you visited / Are you aware of Stack Overflow Careers?,nan (I can buy anything I want without asking anyone),nan (Current Lang & Tech: Scala),nan (Current Lang & Tech: Node.js),PlatformWorkedWith,AISearchWantToWorkWith,Frustration,EquipmentSatisfiedStorage,WebframeHaveWorkedWith,Blockchain,EducationParents,How large is the team that you work on?,nan (Current Lang & Tech: Cordova),SkipMeals,nan (Perception of contact form: Facebook),"nan (High Quality Office Space (amenities, lounge space, free food, etc))",nan (Gender),ToolCountPersonal,DevEnviron,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (Influencer),PlatformWantEntry,JobProfile,PlatformWantToWorkWith,How did you find out about your current job?,ImportantHiringRep,nan (Perception of contact form: LinkedIn),SOTagsAdmired,SOAccount,Age,AINextVery different,MentalHealth,StackOverflowVisit,CodeRevHrs,EmbeddedAdmired,AISent,nan (Includes salary information),nan (Why answer: Demonstrate expertise),EnjoyDebugging,nan (Source control used: Perforce),Which technologies are you excited about? (Node.js),nan (Future Lang & Tech: Node.js),nan (Future Lang & Tech: AngularJS),Race,AssessJobLeaders,"Please rate how important each of the following characteristics of a company/job offer are to you. Please select a MAXIMUM of 3 items as ""Non-Negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (High Base Compensation)",nan (Future Lang & Tech: Arduino),desktop_os,nan (Direct sales to consumers),MobileDeveloperType,nan (Future Lang & Tech: Rust),nan (How can companies improve interview process: Gimme coffee),ImportantHiringGettingThingsDone,Are you currently looking for a job or open to new opportunities?,nan (Future Lang & Tech: Write-In),MetricAssess,How often do you find solutions to your programming problems on Stack Overflow without asking a new question?,nan (Current Lang & Tech: C),JobSeekingStatus,nan (Prefered Source Control),AINextSomewhat similar,NEWCollabToolsWorkedWith,AssessJob3,nan (Purchasing Power),TechEndorse_4,SO_Actions_3,nan (Appealing message traits: Code or projects mentioned),What other departments / roles do you interact with regularly? (System Administrators),age_midpoint,StackOverflowDescribes,TechEndorse_6,TechOppose_11,ExCoderWillNotCode,Were you aware of the Apptivate contest?,AIToolPlan to mostly use AI,nan (other (please specify)),Which US State or Territory do you live in?,nan (Prefered Source Control: write-in),WebFrameWorkedWith,nan (LinkedIn Inmail),nan (Servers),LanguageWorkedWith,occupation,ErgonomicDevices,EquipmentSatisfiedRAM,NEWOffTopic,AIAgentOrchestration,nan (Other Stack Exchange (please specify)),important_variety,country,JobContactPriorities3,nan (Training & Education: On the job),NEWOvertime,PlatformHaveWorkedWith,nan (Perception of contact form: Phone),nan (Phone),ScreenName,Sexuality,nan (Surfing the Internet),HoursPerWeek,nan (Direct sales to companies),What is your current Stack Overflow reputation?,nan (Source control used: I don't use source control),SOAI,DevEnvsWantToWorkWith,DevType,nan (Current Lang & Tech: F#),VCHostingProfessional use,nan (Who do you want to communicate with about a new job opportunity: In-house recruiter),OfficeStackAsyncAdmired,nan (Most urgent info about job opportunity: Office location),important_newtech,nan (Current Lang & Tech: Write-In),AssessJobExp,Country,nan (Check Writer),Currency,nan (PHP),Overpaid,SOTagsWant Entry,Select all that apply (Most urgent info about job opportunity: Salary),nan (Current Lang & Tech: Windows Phone),ImportantHiringAlgorithms,EdLevel,MiscTechWantToWorkWith,ResumePrompted,EthicalImplications,nan (Most annoying about job search: Writing and updating CV),"nan (I influence purchasing decisions, but don't have final approval)",self_identification,ResponseId,nan (Technical support),nan (Designers),AISearchDevHaveWorkedWith,nan (Source control used: DCVS),nan (Why use Stack Overflow: Receive help on personal projects),OfficeStackAsyncHaveWorkedWith,ICorPM,collector,nan (Most important aspect of new job opportunity: Advancement),AssessJobDept,nan (Preferred text editor),WorkLoc,WantWorkDatabase,DevEnvsHaveWorkedWith,AssessJobProduct,WorkChallenge,JobSearchStatus,nan (Future Lang & Tech: Java),ImportantHiringOpenSource,nan (Mentions my code or Stack Overflow activity),SO_Actions_4,Knowledge_8,nan (Future Lang & Tech: JavaScript),Select up to 3 (Appealing message traits: Message is personalized),"In the last 12 months, how much money have you spent on personal technology-related purchases?",nan,nan (Current Lang & Tech: Visual Basic),AIFrustration,JobEmailPriorities2,nan (Why use Stack Overflow: I don't use Stack Overflow),nan (Perception of contact form: Twitter),nan (Tabs or Spaces),ImportantHiringPMExp,AssessJobRole,Select all that apply (Source control used: Git),nan (Commuting),nan (How often contacted by recruiters),SOVisit1st,experience_range,why_stack_overflow,YearsCodedJobPast,AIThreat,AssessBenefits8,nan (Perception of contact form: Stack Overflow Careers),StackOverflowDevStory,AIAgentChallengesSomewhat disagree,education,RemoteWork,nan (Grants / outside fund-raising),nan (Job Satisfaction),JobSeek,nan (How can companies improve interview process: Flexible interview schedule),AIOpen,nan (Consultants),nan (Future Lang & Tech: Sharepoint),LastNewJob,nan (Most important aspect of new job opportunity: Work - Life balance),AgreeDisagree1,Which of the following best describes your occupation?,CommPlatformAdmired,age_range,AssessJob7,JobSatPoints_9,nan (Current Lang & Tech: Ruby),nan (Source control used: Bitkeeper),AssessBenefits4,What operating system do you use the most?,nan (PS4),InfluenceVizTools,nan (Future Lang & Tech: SQL Server),"nan (User Equipment: Monitors, PCs, Laptops)",nan (Other media streaming device),SOTimeSaved,nan (Sales / Marketing),SurveyEase,JobSatPoints_8,JobSat,nan (Current Lang & Tech: Objective-C),nan (Years IT / Programming Experience),nan (Current Lang & Tech: Cassandra),nan (CSS),nan (Mobile app sales),nan (Current Lang & Tech: PHP),nan (Future Lang & Tech: Perl),nan (Future Lang & Tech: Cloud),TrueFalse_2,nan (Other),Industry,nan (I have a discretionary budget at my disposal),nan (Wii),nan (Describes benefits / perks of the work environment),EthicsChoice,LanguageWantToWorkWith,nan (How frequently land on or read Stack Overflow),nan (How many caffeinated beverages per day?),"Including bonus, what is your annual compensation in USD?",important_buildnew,important_companymission,nan (Limited night / weekend work),nan (Approver),What Country or Region do you live in?,CodeRev,TechEndorse_2,nan (Most important aspect of new job opportunity: Company culture),nan (Nook),nan (Why try Stack Overflow Careers: Showcase Stack Overflow activity),StackOverflowParticipate,US_State,AIAgentImpactSomewhat agree,OpSysPersonal use,ConvertedSalary,FormalEducation,DevEnvHaveEntry,nan (Answer questions I know the answer to),DevEnvsChoice,agree_adblocker,UndergradMajor,SO_Actions_15_TEXT,AIAgentExtWrite,Hobby,nan (Current Lang & Tech: SQL Server),women_on_team,SO_Actions_9,AIComplex,HoursOutside,HypotheticalTools3,PlatformChoice,AIAgentChallengesStrongly disagree,JobContactPriorities5,agree_tech,AIToolNot interested in Using,nan (Future Lang & Tech: iOS),AssessJobProfDevel,nan (Identification With the Company/Goals),AssessJob6,SurveyYear,nan (Most urgent info about job opportunity: Tech stack),experience_midpoint,NEWJobHuntResearch,nan (Prefered IDE theme),AINextMuch more integrated,ExCoderNotForMe,Did you participate in the Apptivate contest?,nan (JQuery),SONewContent,nan (What ads? I use an ad blocker),nan (Training & Education: BS in CS),ExCoderBelonged,WebframeDesireNextYear,How many years of IT/Programming experience do you have?,nan (MongoDB),ToolCountWork,QuestionsConfusing,HypotheticalTools5,important_control,RaceEthnicity,AIModelsChoice,EmbeddedWantToWorkWith,nan (Boxee),TechOppose_13,Hobbyist,LearningNewTech,nan (The ads are entertaining),nan (PS3),AuditoryEnvironment,Unnamed: 0,nan (Future Lang & Tech: Python),StackOverflowRecommend,Which technology products do you own? (You can choose more than one) (iPhone),interview_likelihood,AIAgentChange,SOTagsWantToWorkWith,InfluenceCloud,nan (Most important aspect of new job opportunity: Company stage),ConvertedComp,SOComm,nan (Current Lang & Tech: Matlab),NumberMonitors,nan (Lots of Control Over Your Own Work),nan (How important is remote when evaluating new job opportunity?),WorkPayCare,EquipmentSatisfiedCPU,AIAcc,OfficeStackSyncHaveWorkedWith,HypotheticalTools2,What is your gender?,nan (Current Lang & Tech: Wordpress),How many people work for your company?,PronounceGIF,AssessJob1,TechOppose_16,nan (Most important aspect of new job opportunity: Flexible work options),AIModelsHaveEntry,nan (Why try Stack Overflow Careers: Selection of revelant jobs),AssessBenefits11,AISearchDevWantToWorkWith,TechOppose_15,nan (Source control used: Mercurial),occupation_group,StackOverflowConsiderMember,nan (Why answer: No idea),OfficeStackAsyncWantToWorkWith,Check,nan (Room for Growth of Skills/Knowledge),SO_Actions_5,AIToolCurrently Using,InfluenceConsultants,nan (Future Lang & Tech: F#),HopeFiveYears,aliens,nan (Kindle Fire),nan (Blackberry),nan (Haskell),SOHow,SOFindAnswer,CommPlatformWantEntr,InfluenceHardware,AgreeDisagree3,nan (Training & Education: Online Class),nan (Don't know),ImportantHiringCommunication,WebFrameDesireNextYear,nan (Most important aspect of new job opportunity: Company size),nan (Windows Phone),AdsAgreeDisagree3,YearsCode,SO_Actions_16,TechDoc,AIModelsWantToWorkWith,nan (Training & Education: Other),nan (Future Lang & Tech: Ruby),new_job_value,How often do you visit job boards?,Knowledge_2,AssessBenefits6,UpdateCV,"nan ($10,001 - $25,000)",nan (Who do you want to communicate with about a new job opportunity: In-house tech recruiter),Which of the following languages or technologies have you used significantly in the past year? (C),AINextVery similar,nan (Refactoring / code quality),NEWPurchaseResearch,nan (AppleTV),AgentUsesGeneral,NEWCollabToolsWantToWorkWith,Do you enjoy working remotely?,DatabaseHaveEntry,nan (Most urgent info about job opportunity: Benefits),Accessibility,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",salary_midpoint,FizzBuzz,EquipmentSatisfiedMonitors,"Including yourself, how many developers are employed at your company?",nan (Software as a service / recurring billing),Professional,SurveyEasy,AIAgentChallengesSomewhat agree,nan (Build my online reputation),AssessJobFinances,agree_problemsolving,nan (Current Lang & Tech: C#),nan (Current Lang & Tech: Hadoop),TechEndorse,nan (Why answer: Sense of responsibility to developers),DatabaseWantEntry,AIModelsWantEntry,ChangeWorld,AssessBenefits1,"nan ($75,001 - $100,000)",YearsCoding,What best describes your career / job satisfaction?,nan (Dart),JobContactPriorities4,nan (I click on ads that interest me),OperatingSystem,AISelect,ExCoderReturn,InvestTimeTools,OpSysProfessional use,LanguageDesireNextYear,AINextMore integrated,nan (Training & Education: Industry certification),nan (Future Lang & Tech: Salesforce),nan (JavaScript),AssessBenefits2,InfluenceDatabase,nan (Xbox 360),SelfTaughtTypes,nan (HDTV),MiscTechHaveWorkedWith,TechEndorse_1,MiscTechWorkedWith,nan (Appealing message traits: Stack Overflow activity mentioned),nan (Current Lang & Tech: Arduino),Methodology,nan (Future Lang & Tech: Scala),MgrWant,nan (Convenient Commute or Telecommute Options),SO_Actions_1,Frequency_1,Select up to 3 (Most annoying about job search: Finding time),AIModelsHaveWorkedWith,nan (Most important aspect of new job opportunity: Job title),nan (Current Lang & Tech: Dart),SurveyLong,PurchaseHow,ToolsTechHaveWorkedWith,DatabaseWantToWorkWith,AIAgentObserveSecure,job_discovery,AIToolDon't plan to use AI for this task,nan (Xbox One),TechEndorse_9,YearsCodePro,nan (Recommender),nan (PhoneGap),Trans,HaveWorkedFramework,AIAgents,ShipIt,nan (Most important aspect of new job opportunity: Tech stack),TimeAfterBootcamp,nan (Other (please specify)),AdsPriorities7,AIDevHaveWorkedWith,AssessJobIndustry,WantWorkFramework,AssessBenefits5,StackOverflowJobSearch,DatabaseWorkedWith,nan (Appealing message traits: Benefits & Perks),AIToolPlan to partially use AI,LanguagesHaveEntry,AINextMuch less integrated,gender,nan (Other netbook),nan (Other gaming system),TechEndorse_7,job_search_annoyance,How many developers are employed at your company?,nan (Why use Stack Overflow: Love to learn),ImportantHiringEducation,WebframeHaveEntry,nan (Why try Stack Overflow Careers: Other),big_mac_index,HaveWorkedPlatform,MainBranch,nan (Arduino / Raspberry Pi),nan (Current Lang & Tech: Cloud),FrameworkDesireNextYear,PurchaseInfluence,JobEmailPriorities7,AgreeDisagree2,AIAgent_Uses,nan (Purchaser),SOPartFreq,agree_diversity,CompanyType,Age1stCode,Knowledge_4,Select all that apply (Why answer: Help a programmer in need),un_subregion,nan (Most annoying about job search: Taking time off work to interview),nan (Testers / Quality Assurance),nan (Compensation: midpoint),nan (How can companies improve interview process: Introduce me to team),PlatformDesireNextYear,CompFreq,TechOppose_15_TEXT,nan (Stack Overflow Careers Message),IDE,EmploymentAddl,AILearnHow,HypotheticalTools4,AdsAgreeDisagree2,nan (How can companies improve interview process: Better preparation),Select all that apply (Why use Stack Overflow: Help for job),CareerSat,nan (Why answer: I don't answer and I don't want to),CompTotal,TBranch,CommunicationTools,star_wars_vs_star_trek,nan (Future Lang & Tech: Objective-C),nan (C++11),nan (Future Lang & Tech: C),WorkStart,Knowledge_9,AssessBenefits9,nan (Human Resources),CompetePeers,CurrencyDesc,StackOverflowCopiedCode,nan (Appealing message traits: Stack Overflow Company Page),nan (Current Lang & Tech: Go),YearsProgram,StackOverflowWhatDo,nan (40 hour work week),CurrencySymbol,TechOppose_7,nan (Why answer: Self promotion),nan (Future Lang & Tech: Wordpress),AIFuture,nan (Python),AssessJobRemote,SODuration,DeveloperType,ResumeUpdate,important_ownoffice,EducationImportant,"nan (Quality of Workstation (dream machine, 30inch monitors, etc))",nan (High Caliber Team (is everyone else smart/hardworking)),nan (Future Lang & Tech: C++11),nan (SQL),WebframeWantToWorkWith,nan (Fixing bugs),nan (Most important aspect of new job opportunity: Important decisions),OpenSourcer,Respondent,AINextSomewhat different,nan (Current Lang & Tech: Swift),nan (Most urgent info about job opportunity: Job title),nan (Most annoying about job search: Interesting companies rarely respond),WorkPlan,EntTeams,nan (Perception of recruiter contact),TechOppose_2,nan (Age),"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",ImportantHiringTechExp,HackathonReasons,Knowledge_1,TimeAnswering,nan (Describes the team I will work on),How do you use Stack Overflow? (Read other people's questions to solve my problems),ToolsTechAdmired,ProfessionalQuestion,nan (CoffeeScript),Containers,nan (Preferred text editor: write-in),DatabaseAdmired,nan (Most important aspect of new job opportunity: Industry),nan (Regular Mobile Phone),CompanySize,InfluenceInternet,EduOther,agree_nightcode,nan (Xbox),TimeSearching,nan (Future Lang & Tech: Spark),nan (Most annoying about job search: Finding job I'm qualified for),nan (Why use Stack Overflow: To give help),TechEndorse_8,unit_testing,StackOverflowNewQuestion,AdsActions,nan (Future Lang & Tech: MongoDB),CommPlatformHaveEntr,company_size_range,UnitTests,nan (Appealing message traits: Team described),nan (Desktop Operating System: write-in),VersionControlSystem,PlatformAdmired,AIEthics,What is your involvement in purchasing? You can choose more than 1. (Influencer),Have you changed jobs in the last 12 months?,agree_mars,TechEndorse_5,nan (Future Lang & Tech: Visual Basic),AIAgentExternal,AssessBenefits10,AISearchHaveWorkedWith,nan (Describes company culture),tech_do,Extraversion,SO_Dev_Content,AINextLess integrated,NEWCollabToolsHaveWorkedWith,OfficeStackHaveEntry,NEWDevOps,nan (Future Lang & Tech: Windows Phone),nan (Most important aspect of new job opportunity: Health insurance),nan (Most important aspect of new job opportunity: Remote working),JobFactors,StackOverflowCompanyPage,FrameworkWorkedWith,nan (Ruby),ITperson,JobSatPoints_4,nan (Current Lang & Tech: Sharepoint),nan (Open to new job opportunities),Select up to 3 (Most important aspect of new job opportunity: Salary),CommPlatformHaveWorkedWith,nan (No mobile app),nan (Why try Stack Overflow Careers: Jobs site for programmers),nan (Current Lang & Tech: SQL),nan (I'm a Seller),AssessJob5,nan (Country),nan (Future Lang & Tech: Cassandra),AssessJob8,EducationTypes,how_to_improve_interview_process,nan (Why answer: I don't answer but I want to),nan (Future Lang & Tech: Cordova),ProfessionalTech,ExCoderBalance,nan (Future Lang & Tech: Clojure),JobEmailPriorities3,nan (Compensation),AIAgentChallengesNeutral,nan (Who do you want to communicate with about a new job opportunity: Headhunter),JobEmailPriorities6,NEWPurpleLink,CousinEducation,nan (I like that I can indicate ads I want to see less of),OrgSize,MgrIdiot,Exercise,AIAgentImpactSomewhat disagree,"In receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (Message is personalized to me)",MgrMoney,WorkWeekHrs,SOHowMuchTime,nan (Current Lang & Tech: Perl),InfluenceDeptTech,LearnCode,JobSatPoints_13,nan (Windows Tablet),NEWStuck,nan (Blu-Ray),Select all that apply (Future Lang & Tech: Android),nan (Appealing message traits: Company culture described),HighestEducationParents,How often are you contacted by recruiters?,TechEndorse_13_TEXT,nan (Future Lang & Tech: Hadoop),SurveyTooLong,BlockchainOrg,nan (How many hours programming as hobby per week?),StackOverflowFoundAnswer,VersionControl,TechOppose_1,ImportantHiringTitles,SOVisitTo,nan (Current Lang & Tech: AngularJS),nan (Desktop Operating System),nan (Current Lang & Tech: C++),Which of our sites do you frequent most?,AdsPriorities2,VCInteraction,InfluenceServers,"nan ($25,001 - $40,000)",nan (Current Lang & Tech: Salesforce),AIAgentImpactStrongly disagree,HypotheticalTools1,nan (jQuery),nan (F#),nan (Future Lang & Tech: Swift),nan (Android Tablet),nan (How can companies improve interview process: Introduce me to boss),so_region,LanguageChoice,nan (Future Lang & Tech: LAMP),nan (How can companies improve interview process: Show me workplace),ProfessionalCloud,TechOppose_3,nan (Most important aspect of new job opportunity: Quality of colleagues),nan (Future Lang & Tech: R),nan (Current Lang & Tech: Clojure),developer_challenges,nan (No Involvement),HaveWorkedDatabase,Knowledge_6,JobSatPoints_5,nan (Future Lang & Tech: Dart),nan (Looking for a new job),YearsCodingProf,nan (Objective-C),nan (Consulting),SocialMedia,SO_Actions_15,nan (Android Phone),nan (Training & Education: Boot camp or night school),HomeRemote,AdsAgreeDisagree1,JobSatPoints_6,industry,nan (Wii U),How old are you?,JobSatPoints_1,nan (Source control used: write-in),nan (Stock Options/Profit Sharing Program),JobEmailPriorities5,nan (Android tablet),NEWOnboardGood,nan (Perception of contact form: Xing),NEWEdImpt,AIBen,nan (Most urgent info about job opportunity: Product details),DiversityImportant,nan (Current Lang & Tech: Spark),nan (Changed Jobs in last 12 Months),Please rate the advertising you've seen on Stack Overflow (The ads are relevant),"In an average week, how do you spend your time at work? (Developing new features)",nan (How can companies improve interview process: Fewer brainteasers),ExCoderActive,RightWrongWay,KinshipDevelopers,WebDeveloperType,nan (Future Lang & Tech: CoffeeScript),NEWCollabToolsAdmired,TechEndorseIntro,TimeFullyProductive,AssessJobDiversity,MiscTechDesireNextYear,LanguageHaveWorkedWith,JobSatPoints_16,UK_Country,nan (Current Lang & Tech: LAMP),agree_notice,AIDevWantToWorkWith,JobSatPoints_15,important_buildexisting,CodingActivities,VCHostingPersonal use,Do you have a Stack Overflow Careers Profile?,nan (I've taken a trial/purchased a product from ads),Dependents,nan (Most urgent info about job opportunity: Company name),team_size_range,nan (Future Lang & Tech: C++),ChallengeMyself,Select all that apply (Training & Education: No formal training),nan (Autonomy Over Budget/Expenditures),WebframeWorkedWith,nan (Customers),Knowledge_3,TechEndorse_13,JobContactPriorities2,nan (Perception of contact form: Email),nan (Product Managers),hobby,TechEndorse_3,StackOverflowAdsDistracting
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Consulting,,,,,,,,,,,Mobile,,,,,,,,,,,,,,,,,Software,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Start Up (1-25),,,,,,,FML,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,<$100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,Linux,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student / Unemployed,,,,Approver,Africa,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,<2,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$25,001 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,< 20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mature Small Business (25-100),,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,Windows 7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other netbook,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,SaaS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mid Sized (100-999),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,Linux,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,India,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I'm a Seller,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Foundation / Non-Profit,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Kindle,,,,,,,Student,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Haskell,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,Linux,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,,,,,,,,,,Wii,,,,,,Student / Unemployed,,,,,Germany,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other gaming system,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Mobile Phone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,< 20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,">$150,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,I run this place,,,"$41,000 - $75,000",,,,,,Android,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C++,,Perl,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Hardware,Software Products,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,Software,,,,,,,,,,,,,,,"$100,001 - $150,000",,,,,,,,,,,,,,,,,,,Kindle,,,,,,,Start Up (1-25),,,,,,,It pays the bills,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Servers,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,PHP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Executive (VP of Eng, CTO, CIO, etc.)",,,,,,,,Linux,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,CSS,,,,,,,,,,,,,,,"$80,000 - $100,000",,,,Approver,Other Asia,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Don't know,,,,,,,,,,,,,,,,,"$10,001 - $25,000",,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,"$75,001 - $100,000",,,,,,,,,,,,,,,JavaScript,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Purchaser,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Mobile Phone,,,,,Xbox,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,"$25,001 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,35-39,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Sample the dataframe (default - 20%)

In [10]:
# Create a stratified sample of 20% of the data
if combined_df is not None:
    # Calculate 20% sample size for each year
    sample_size = 0.2
    
    # Perform stratified sampling
    stratified_sample = combined_df.groupby('SurveyYear', group_keys=False).apply(
        lambda x: x.sample(frac=sample_size, random_state=42)
    ).reset_index(drop=True)
    
    # Export to CSV
    output_file = 'stackoverflow_survey_stratified_sample.csv'
    stratified_sample.to_csv(output_file, index=False)
    
    print(f"Original dataset size: {len(combined_df):,} rows")
    print(f"Sampled dataset size: {len(stratified_sample):,} rows")
    print(f"\nSample size by year:")
    print(stratified_sample['SurveyYear'].value_counts().sort_index())
    print(f"\nData exported to: {output_file}")

Original dataset size: 772,599 rows
Sampled dataset size: 154,521 rows

Sample size by year:
SurveyYear
2011      563
2012     1249
2013     1948
2014     1529
2015     5217
2016    11206
2017    10278
2018    19771
2019    17777
2020    12892
2021    16688
2022    14654
2023    17837
2024    13087
2025     9825
Name: count, dtype: int64

Data exported to: stackoverflow_survey_stratified_sample.csv


In [11]:
stratified_sample.to_csv('stackoverflow_survey_stratified_sample.csv')

### Adjust df_use based on the dataframe you want to use (sample_df, stratified_sample, etc)

In [12]:
sample_df = pd.read_csv('stackoverflow_survey_stratified_sample.csv')

KeyboardInterrupt: 

In [13]:
df_use = stratified_sample.copy()

### Adjust column names for Undefined/NaN

In [14]:
# Remove any columns with 'Unnamed' in their name from the combined dataframe
if df_use is not None:
    df_use = df_use.loc[:, ~df_use.columns.str.contains('^Unnamed')]

In [15]:
# Remove columns with 'nan' in their names from combined_df and individual year dataframes
if df_use is not None:
    # For combined dataframe
    nan_columns = df_use.columns[df_use.columns.str.contains('nan', case=False, na=False)]
    if len(nan_columns) > 0:
        print("Removing columns containing 'nan' from combined dataframe:")
        print(list(nan_columns))
        df_use = df_use.drop(columns=nan_columns)

print("\nDone cleaning column names.")

Removing columns containing 'nan' from combined dataframe:
['nan (>$150,000)', 'nan (Other tablet)', 'nan (Training & Education: Masters in CS)', 'nan (Most important aspect of new job opportunity: Office location)', 'nan (Training & Education: Mentorship)', "nan (Excitement About the Company's Products)", 'nan (Go)', 'nan (Current Lang & Tech: R)', "nan (Why use Stack Overflow: Can't do job without it)", 'nan (Remote Status)', 'nan (Future Lang & Tech: PHP)', 'nan (WinRT)', 'nan (Current Lang & Tech: Python)', 'nan (Redis)', 'nan (Why answer: Help future programmers)', 'nan (Current Lang & Tech: Java)', 'nan (Why use Stack Overflow: Maintain online presence)', 'nan (Looking for a job)', 'nan (Most important aspect of new job opportunity: Company reputation)', 'nan (Why try Stack Overflow Careers: Jobs are on Stack Overflow)', 'nan (Employment Status)', 'nan (TypeScript)', 'nan (Future Lang & Tech: C#)', 'nan (Link to a Stack Overflow Careers Company Page or other source of more inform

### Convert column names to lower case

In [16]:
# Update all column names in combined_df and each dataframe in dataframes to be lower case
df_use.columns = [col.lower() for col in df_use.columns]

In [17]:
df_use.head()

Unnamed: 0,expectedsalary,aitoolinterested in using,truefalse_1,sotagshaveworkedwith,toolstechwanttoworkwith,officestacksyncadmired,newcollabtoolsdesirenextyear,knowledge_7,ainextneither different nor similar,sexualorientation,adspriorities5,webframechoice,impsyn,embeddedhaveworkedwith,databasechoice,agree_legacy,opsys,officestacksyncwanttoworkwith,do you work remotely?,buynewtool,stackoverflowjobsrecommend,learncodeai,stackoverflowmetachat,checkincode,adspriorities1,influenceworkstation,assessjoboffice,newjobhunt,gender,jobemailpriorities4,webframewantentry,which languages are you proficient in? (java),annoyingui,stackoverflowjoblisting,aidangerous,commit_frequency,agree_loveboss,job_satisfaction,stackoverflowbetter,do you have a stack overflow careers 2.0 profile?,wantworkplatform,equipmentsatisfiedrw,employmentstatus,wantworklanguage,assessbenefits7,welcomechange,difficultcommunication,techlist,buildvsbuy,techoppose_5,rep_range,ainextno change,excoderskills,which desktop operating system do you use the most?,assessbenefits3,learncodeonline,frequency_2,importantbenefits,open_to_new_job,salary_range,inthezone,why_learn_new_tech,careersatisfaction,employment,newrole,influencerecruitment,remote,assessjob9,what country do you live in?,webframeadmired,stackoverflowsatisfaction,workexp,stackoverflowjobs,lastint,newdevopsimpt,frequency_3,sotagshaveentry,aichallenges,aitoolcurrently mostly ai,assessjobprojects,aimodelsadmired,languageadmired,jobsatpoints_15_text,platformhaveentry,student,jobsatpoints_14,aiagentknowwrite,how likely is it that a recommendation you make will be acted upon?,ethicsresponsible,"if your company has a native mobile app, what platforms do you support? (iphone)",aihuman,how would you best describe the industry you currently work in?,influencecommunication,important_sameend,stackoverflowanswer,learncodechoose,important_promotion,employment_status,select all that apply (current lang & tech: android),assessjob4,aiagentorchwrite,agree_alcohol,what advertisers do you remember seeing on stack overflow? (open-ended response),"you answered you don't have a careers profile, can you elaborate why?",dev_environment,dogs_vs_cats,officestackwantentry,select all that apply (why try stack overflow careers: no spam),so_actions_7,stackoverflowhelpful,problemsolving,devenvsadmired,aisearchdevadmired,aiagentobswrite,aiagentimpactneutral,"in an average week, how do you spend your time? (developing new features)",programming_ability,learnedhiring,q120,sojobs,sofriction,friendsdevelopers,university,stackoverflowadsrelevant,buildingthings,stackoverflowmoderation,sovisitfreq,ethnicity,onboarding,adspriorities3,waketime,learncodecoursescert,select up to 3 (how can companies improve interview process: more live code),surveylength,languageswantentry,assessjobcommute,stackoverflowhasaccount,jobcontactpriorities1,jobemailpriorities1,collaborateremote,assessjobtech,haveworkedlanguage,workremote,important_wfh,aitoolcurrently partially ai,stackoverflowdevices,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",purchasewhat,what types of purchases are you involved in? (hardware),how would you best describe the industry you work in?,understandcomputers,truefalse_3,databasehaveworkedwith,influencetechstack,aiinteresting,tech_want,assessjob2,devenvwantentry,majorundergrad,what type of project are you developing?,nondevelopertype,salary,newothercomms,betterlife,so_actions_10,misctechadmired,interestedanswers,adspriorities4,clickykeys,programhobby,jobsatpoints_7,adspriorities6,newsosites,aiexplain,otherpeoplescode,adblocker,jobsatpoints_10,knowledge_5,ethicsreport,blockchainis,where do you work remotely most of the time?,importanthiringcompanies,boringdetails,adblockerdisable,adblockerreasons,jobsatisfaction,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),assessjob10,jobsecurity,airesponsible,stackoverflowcommunity,commplatformwanttoworkwith,tabsspaces,offon,yearscodedjob,seriouswork,so_actions_6,which best describes the size of your company?,stackoverflowmakemoney,militaryus,visit_frequency,please rate your job/career satisfaction,excoder10years,projectmanagement,jobsatpoints_11,aiagentimpactstrongly agree,lasthiredate,have you visited / are you aware of stack overflow careers 2.0?,aiagentchallengesstrongly agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",opensource,how do you prefer to be contacted about job opportunities? (email),questionsinteresting,salarytype,newlearn,techoppose_9,assessjobcompensation,databasedesirenextyear,hourscomputer,convertedcompyearly,aiagentknowledge,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",have you visited / are you aware of stack overflow careers?,platformworkedwith,aisearchwanttoworkwith,frustration,equipmentsatisfiedstorage,webframehaveworkedwith,blockchain,educationparents,how large is the team that you work on?,skipmeals,toolcountpersonal,devenviron,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),platformwantentry,jobprofile,platformwanttoworkwith,how did you find out about your current job?,importanthiringrep,sotagsadmired,soaccount,age,ainextvery different,mentalhealth,stackoverflowvisit,coderevhrs,embeddedadmired,aisent,enjoydebugging,which technologies are you excited about? (node.js),race,assessjobleaders,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",desktop_os,mobiledevelopertype,importanthiringgettingthingsdone,are you currently looking for a job or open to new opportunities?,metricassess,how often do you find solutions to your programming problems on stack overflow without asking a new question?,jobseekingstatus,ainextsomewhat similar,newcollabtoolsworkedwith,assessjob3,techendorse_4,so_actions_3,what other departments / roles do you interact with regularly? (system administrators),age_midpoint,stackoverflowdescribes,techendorse_6,techoppose_11,excoderwillnotcode,were you aware of the apptivate contest?,aitoolplan to mostly use ai,which us state or territory do you live in?,webframeworkedwith,languageworkedwith,occupation,ergonomicdevices,equipmentsatisfiedram,newofftopic,aiagentorchestration,important_variety,country,jobcontactpriorities3,newovertime,platformhaveworkedwith,screenname,sexuality,hoursperweek,what is your current stack overflow reputation?,soai,devenvswanttoworkwith,devtype,vchostingprofessional use,officestackasyncadmired,important_newtech,assessjobexp,country.1,currency,overpaid,sotagswant entry,select all that apply (most urgent info about job opportunity: salary),importanthiringalgorithms,edlevel,misctechwanttoworkwith,resumeprompted,ethicalimplications,self_identification,responseid,aisearchdevhaveworkedwith,officestackasynchaveworkedwith,icorpm,collector,assessjobdept,workloc,wantworkdatabase,devenvshaveworkedwith,assessjobproduct,workchallenge,jobsearchstatus,importanthiringopensource,so_actions_4,knowledge_8,select up to 3 (appealing message traits: message is personalized),"in the last 12 months, how much money have you spent on personal technology-related purchases?",aifrustration,jobemailpriorities2,importanthiringpmexp,assessjobrole,select all that apply (source control used: git),sovisit1st,experience_range,why_stack_overflow,yearscodedjobpast,aithreat,assessbenefits8,stackoverflowdevstory,aiagentchallengessomewhat disagree,education,remotework,jobseek,aiopen,lastnewjob,agreedisagree1,which of the following best describes your occupation?,commplatformadmired,age_range,assessjob7,jobsatpoints_9,assessbenefits4,what operating system do you use the most?,influenceviztools,sotimesaved,surveyease,jobsatpoints_8,jobsat,truefalse_2,industry,ethicschoice,languagewanttoworkwith,"including bonus, what is your annual compensation in usd?",important_buildnew,important_companymission,what country or region do you live in?,coderev,techendorse_2,stackoverflowparticipate,us_state,aiagentimpactsomewhat agree,opsyspersonal use,convertedsalary,formaleducation,devenvhaveentry,devenvschoice,agree_adblocker,undergradmajor,so_actions_15_text,aiagentextwrite,hobby,women_on_team,so_actions_9,aicomplex,hoursoutside,hypotheticaltools3,platformchoice,aiagentchallengesstrongly disagree,jobcontactpriorities5,agree_tech,aitoolnot interested in using,assessjobprofdevel,assessjob6,surveyyear,experience_midpoint,newjobhuntresearch,ainextmuch more integrated,excodernotforme,did you participate in the apptivate contest?,sonewcontent,excoderbelonged,webframedesirenextyear,how many years of it/programming experience do you have?,toolcountwork,questionsconfusing,hypotheticaltools5,important_control,raceethnicity,aimodelschoice,embeddedwanttoworkwith,techoppose_13,hobbyist,learningnewtech,auditoryenvironment,stackoverflowrecommend,which technology products do you own? (you can choose more than one) (iphone),interview_likelihood,aiagentchange,sotagswanttoworkwith,influencecloud,convertedcomp,socomm,numbermonitors,workpaycare,equipmentsatisfiedcpu,aiacc,officestacksynchaveworkedwith,hypotheticaltools2,what is your gender?,how many people work for your company?,pronouncegif,assessjob1,techoppose_16,aimodelshaveentry,assessbenefits11,aisearchdevwanttoworkwith,techoppose_15,occupation_group,stackoverflowconsidermember,officestackasyncwanttoworkwith,check,so_actions_5,aitoolcurrently using,influenceconsultants,hopefiveyears,aliens,sohow,sofindanswer,commplatformwantentr,influencehardware,agreedisagree3,importanthiringcommunication,webframedesirenextyear.1,adsagreedisagree3,yearscode,so_actions_16,techdoc,aimodelswanttoworkwith,new_job_value,how often do you visit job boards?,knowledge_2,assessbenefits6,updatecv,which of the following languages or technologies have you used significantly in the past year? (c),ainextvery similar,newpurchaseresearch,agentusesgeneral,newcollabtoolswanttoworkwith,do you enjoy working remotely?,databasehaveentry,accessibility,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",salary_midpoint,fizzbuzz,equipmentsatisfiedmonitors,"including yourself, how many developers are employed at your company?",professional,surveyeasy,aiagentchallengessomewhat agree,agree_problemsolving,techendorse,databasewantentry,aimodelswantentry,changeworld,assessbenefits1,yearscoding,what best describes your career / job satisfaction?,jobcontactpriorities4,operatingsystem,aiselect,excoderreturn,investtimetools,opsysprofessional use,languagedesirenextyear,ainextmore integrated,assessbenefits2,influencedatabase,selftaughttypes,misctechhaveworkedwith,techendorse_1,misctechworkedwith,methodology,mgrwant,so_actions_1,frequency_1,select up to 3 (most annoying about job search: finding time),aimodelshaveworkedwith,surveylong,purchasehow,toolstechhaveworkedwith,databasewanttoworkwith,aiagentobservesecure,job_discovery,aitooldon't plan to use ai for this task,techendorse_9,yearscodepro,trans,haveworkedframework,aiagents,shipit,timeafterbootcamp,adspriorities7,aidevhaveworkedwith,assessjobindustry,wantworkframework,assessbenefits5,stackoverflowjobsearch,databaseworkedwith,aitoolplan to partially use ai,languageshaveentry,ainextmuch less integrated,gender.1,techendorse_7,job_search_annoyance,how many developers are employed at your company?,importanthiringeducation,webframehaveentry,big_mac_index,haveworkedplatform,mainbranch,frameworkdesirenextyear,purchaseinfluence,jobemailpriorities7,agreedisagree2,aiagent_uses,sopartfreq,agree_diversity,companytype,age1stcode,knowledge_4,select all that apply (why answer: help a programmer in need),un_subregion,platformdesirenextyear,compfreq,techoppose_15_text,ide,employmentaddl,ailearnhow,hypotheticaltools4,adsagreedisagree2,select all that apply (why use stack overflow: help for job),careersat,comptotal,tbranch,communicationtools,star_wars_vs_star_trek,workstart,knowledge_9,assessbenefits9,competepeers,currencydesc,stackoverflowcopiedcode,yearsprogram,stackoverflowwhatdo,currencysymbol,techoppose_7,aifuture,assessjobremote,soduration,developertype,resumeupdate,important_ownoffice,educationimportant,webframewanttoworkwith,opensourcer,respondent,ainextsomewhat different,workplan,entteams,techoppose_2,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",importanthiringtechexp,hackathonreasons,knowledge_1,timeanswering,how do you use stack overflow? (read other people's questions to solve my problems),toolstechadmired,professionalquestion,containers,databaseadmired,companysize,influenceinternet,eduother,agree_nightcode,timesearching,techendorse_8,unit_testing,stackoverflownewquestion,adsactions,commplatformhaveentr,company_size_range,unittests,versioncontrolsystem,platformadmired,aiethics,what is your involvement in purchasing? you can choose more than 1. (influencer),have you changed jobs in the last 12 months?,agree_mars,techendorse_5,aiagentexternal,assessbenefits10,aisearchhaveworkedwith,tech_do,extraversion,so_dev_content,ainextless integrated,newcollabtoolshaveworkedwith,officestackhaveentry,newdevops,jobfactors,stackoverflowcompanypage,frameworkworkedwith,itperson,jobsatpoints_4,select up to 3 (most important aspect of new job opportunity: salary),commplatformhaveworkedwith,assessjob5,assessjob8,educationtypes,how_to_improve_interview_process,professionaltech,excoderbalance,jobemailpriorities3,aiagentchallengesneutral,jobemailpriorities6,newpurplelink,cousineducation,orgsize,mgridiot,exercise,aiagentimpactsomewhat disagree,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",mgrmoney,workweekhrs,sohowmuchtime,influencedepttech,learncode,jobsatpoints_13,newstuck,select all that apply (future lang & tech: android),highesteducationparents,how often are you contacted by recruiters?,techendorse_13_text,surveytoolong,blockchainorg,stackoverflowfoundanswer,versioncontrol,techoppose_1,importanthiringtitles,sovisitto,which of our sites do you frequent most?,adspriorities2,vcinteraction,influenceservers,aiagentimpactstrongly disagree,hypotheticaltools1,so_region,languagechoice,professionalcloud,techoppose_3,developer_challenges,haveworkeddatabase,knowledge_6,jobsatpoints_5,yearscodingprof,socialmedia,so_actions_15,homeremote,adsagreedisagree1,jobsatpoints_6,industry.1,how old are you?,jobsatpoints_1,jobemailpriorities5,newonboardgood,newedimpt,aiben,diversityimportant,please rate the advertising you've seen on stack overflow (the ads are relevant),"in an average week, how do you spend your time at work? (developing new features)",excoderactive,rightwrongway,kinshipdevelopers,webdevelopertype,newcollabtoolsadmired,techendorseintro,timefullyproductive,assessjobdiversity,misctechdesirenextyear,languagehaveworkedwith,jobsatpoints_16,uk_country,agree_notice,aidevwanttoworkwith,jobsatpoints_15,important_buildexisting,codingactivities,vchostingpersonal use,do you have a stack overflow careers profile?,dependents,team_size_range,challengemyself,select all that apply (training & education: no formal training),webframeworkedwith.1,knowledge_3,techendorse_13,jobcontactpriorities2,hobby.1,techendorse_3,stackoverflowadsdistracting
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Services,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Start Up (1-25),,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,California,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,Linux,,,,,,,,,,,,,United States of America,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,11,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Start Up (1-25),,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$2,001-$3,000",,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,Mac OS X,,,,,,,,,,"$40,000 - $60,000",,,Other Europe,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,11,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,40-50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mature Small Business (25-100),,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,Windows 7,,,,,,,,,,"$20,000 - $40,000",,,South America,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,11,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Healthcare,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mid Sized (100-999),,,,FML,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,Mac OS X,,,,,,,,,,"$60,000 - $80,000",,,Other Asia,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Other (not working, consultant, etc.)",,,,I enjoy going to work,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,,,Student,,,,,,Linux,,,,,,,,,,"<$20,000",,,Other Europe,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Identify and combine like columns

In [18]:
# Dictionary of column groups to combine
column_groups = {
    'years_coding': ['yearscode', 'yearscodingprof', 'yearscodepro', 'work_experience', 'yearsprogram', 'yearscodedjob', 'how many years of it/programming experience do you have?'],
    'education': ['edlevel', 'education', 'formaleducation'],
    'employment': ['employment', 'employmentstatus', 'employment_status'],
    'company_size': ['companysize', 'company_size_range', 'orgsize', 'companyemployeesrange', 'how many people work for your company?', 'which best describes the size of your company?'],
    'salary': ['convertedsalary', 'convertedcomp', 'comptotal'],
    'job_satisfaction': ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction', 'please rate your job/career satisfaction'],
    'job_title': ['jobtitle', 'currentjobtitle', 'jobprofile'],
    'developer_type': ['developertype', 'devtype'],
    'industry': ['industry', 'industrytype', 'companytype', 'how would you best describe the industry you work in?', 'how would you best describe the industry you currently work in?'],
    'country': ['country', 'location', 'countrycode', 'what country or region do you live in?', 'what country do you live in?'],
    'programming_experience': ['yearscode', 'yearscoding', 'codingexperience'],
    'database_worked_with': ['databaseworkedwith', 'dbworkedwith'],
    'dev_environment': ['ide', 'developmentenvironment', 'dev_environment', 'devenviron', 'devenvironment'],
    'operating_sys': ['opsys', 'operatingsystem', 'os', 'what operating system do you use the most?', 'which desktop operating system do you use the most?'],
    'dev_methodology': ['methodology', 'devmethodology', 'developmentmethodology'],
    'communication_tools': ['communicationtools', 'collaboration', 'collabtools'],
    'gender': ['gender', 'sex', 'what is your gender?'],
    'age': ['age', 'agerange','age_range','agegrouping', 'how old are you?'],
    'learning': ['learncode', 'learncodehow', 'learningmethod'],
    'work_experience': ['workexp', 'experience', 'yearsexperience'],
    'remote': ['remotework', 'workremote', 'remotestatus', 'do you work remotely?', 'homeremote', 'remote'],
    'team_size': ['teamsize', 'orgteamsize', 'developmentteamsize', 'how large is the team that you work on?'],
    'survey_easy': ['surveyease', 'surveyeasy', 'surveylong', 'surveytoolong', 'surveylength'],
    'version_control_sys': ['versioncontrol', 'versioncontrolsystem', 'vcs'],
    'currency': ['currency', 'currencydesc'],
    'hobby': ['hobby', 'hobbyist'],
    'race': ['race', 'raceethnicity', 'self_identification']
}

In [19]:
df_use_2 = df_use.copy()

In [20]:
# Identify and combine duplicate columns
def combine_duplicate_columns(df):
    """
    Identifies columns with the same name (case-insensitive), combines their data
    into a single column by taking the first non-null value, and removes the originals.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with duplicate columns combined.
    """
    df_combined_duplicates = pd.DataFrame(index=df.index)
    processed_columns = set()

    for col_name in df.columns:
        col_name_lower = col_name.lower()

        if col_name_lower not in processed_columns:
            # Find all columns with this name (case-insensitive)
            duplicate_columns = [col for col in df.columns if col.lower() == col_name_lower]

            if len(duplicate_columns) > 1:
                print(f"Combining duplicate columns for '{col_name_lower}': {duplicate_columns}")
                # Select the duplicate columns
                selected_duplicates = df[duplicate_columns]
                # Combine by taking the first non-null value across rows
                combined_series = selected_duplicates.bfill(axis=1).iloc[:, 0]
                df_combined_duplicates[col_name_lower] = combined_series
                # Add to processed set
                processed_columns.add(col_name_lower)
            else:
                # Not a duplicate, just add the column
                df_combined_duplicates[col_name_lower] = df[col_name]
                processed_columns.add(col_name_lower)

    return df_combined_duplicates

# Apply the function to combine duplicate columns in df_use_2
df_use_combined_duplicates = combine_duplicate_columns(df_use_2)

print("\nOriginal DataFrame shape:", df_use_2.shape)
print("DataFrame shape after combining duplicates:", df_use_combined_duplicates.shape)

Combining duplicate columns for 'gender': ['gender', 'gender']
Combining duplicate columns for 'webframeworkedwith': ['webframeworkedwith', 'webframeworkedwith']
Combining duplicate columns for 'country': ['country', 'country']
Combining duplicate columns for 'industry': ['industry', 'industry']
Combining duplicate columns for 'hobby': ['hobby', 'hobby']
Combining duplicate columns for 'webframedesirenextyear': ['webframedesirenextyear', 'webframedesirenextyear']

Original DataFrame shape: (154521, 729)
DataFrame shape after combining duplicates: (154521, 723)


In [21]:
def combine_columns(df, column_list):
    """
    Combines data from a list of columns into a single Series,
    taking the first non-null value across the columns for each row.

    Args:
        df: The input pandas DataFrame.
        column_list: A list of column names to combine.

    Returns:
        A pandas Series containing the combined data.
    """
    # Select the specified columns
    selected_columns = df[column_list]

    # Combine columns by taking the first non-null value across rows
    combined_series = selected_columns.bfill(axis=1).iloc[:, 0]

    return combined_series

In [22]:
# Initialize an empty dictionary to store consolidated columns
consolidated_columns_dict = {}

# Iterate through the column_groups dictionary
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]

    # If there are columns from the current group present in the DataFrame
    if present_columns:
        print(f"Processing group '{group_name}' with columns: {present_columns}")
        # Call the combine_columns function
        combined_series = combine_columns(df_use_combined_duplicates, present_columns)
        # Store the resulting combined Series in the dictionary
        consolidated_columns_dict[group_name] = combined_series
    else:
        print(f"No columns found for group '{group_name}' in the DataFrame.")

# Create a new DataFrame from the dictionary of consolidated columns
df_consolidated = pd.DataFrame(consolidated_columns_dict)

print("\nConsolidated DataFrame created.")
print(f"Shape of consolidated DataFrame: {df_consolidated.shape}")
df_consolidated.head()

Processing group 'years_coding' with columns: ['yearscode', 'yearscodingprof', 'yearscodepro', 'yearsprogram', 'yearscodedjob', 'how many years of it/programming experience do you have?']
Processing group 'education' with columns: ['edlevel', 'education', 'formaleducation']
Processing group 'employment' with columns: ['employment', 'employmentstatus', 'employment_status']
Processing group 'company_size' with columns: ['companysize', 'company_size_range', 'orgsize', 'how many people work for your company?', 'which best describes the size of your company?']
Processing group 'salary' with columns: ['convertedsalary', 'convertedcomp', 'comptotal']
Processing group 'job_satisfaction' with columns: ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction', 'please rate your job/career satisfaction']
Processing group 'job_title' with columns: ['jobprofile']
Processing group 'developer_type' with columns: ['developertype', 'devtype']
Processing group 'industry' with columns: ['industry', 'c

Unnamed: 0,years_coding,education,employment,company_size,salary,job_satisfaction,job_title,developer_type,industry,country,programming_experience,database_worked_with,dev_environment,operating_sys,dev_methodology,communication_tools,gender,age,learning,work_experience,remote,team_size,survey_easy,version_control_sys,currency,hobby,race
0,11,,,Start Up (1-25),,So happy it hurts,,,Web Services,United States of America,,,,Linux,,,,30-34,,,,,,,,,
1,11,,,Start Up (1-25),,I enjoy going to work,,,Software Products,Other Europe,,,,Mac OS X,,,,40-50,,,,,,,,,
2,11,,,Mature Small Business (25-100),,So happy it hurts,,,Software Products,South America,,,,Windows 7,,,,30-34,,,,,,,,,
3,41435,,,Mid Sized (100-999),,FML,,,Healthcare,Other Asia,,,,Mac OS X,,,,30-34,,,,,,,,,
4,41310,,,"Other (not working, consultant, etc.)",,I enjoy going to work,,,Other,Other Europe,,,,Linux,,,,25-29,,,,,,,,,


In [23]:
# Create a list of all original column names that were combined
original_columns_to_drop = []
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]
    original_columns_to_drop.extend(present_columns)

# Remove duplicates from the list of columns to drop
original_columns_to_drop = list(set(original_columns_to_drop))

# Drop these original columns from the df_use_combined_duplicates DataFrame
df_use_combined_duplicates_dropped = df_use_combined_duplicates.drop(columns=original_columns_to_drop, errors='ignore')

print("Original columns that were combined have been dropped.")
print(f"Shape of df_use_combined_duplicates after dropping original columns: {df_use_combined_duplicates_dropped.shape}")

Original columns that were combined have been dropped.
Shape of df_use_combined_duplicates after dropping original columns: (154521, 651)


In [24]:
# Concatenate the remaining columns from df_use_combined_duplicates_dropped with the consolidated columns
df_all = pd.concat([df_use_combined_duplicates_dropped, df_consolidated], axis=1)

print("\nFinal DataFrame created by concatenating remaining original columns and consolidated columns.")
print(f"Shape of the final DataFrame: {df_all.shape}")

# Display the first few rows of the final dataframe
print("\nFirst 5 rows of the final DataFrame:")
display(df_all.head())


Final DataFrame created by concatenating remaining original columns and consolidated columns.
Shape of the final DataFrame: (154521, 678)

First 5 rows of the final DataFrame:


Unnamed: 0,expectedsalary,aitoolinterested in using,truefalse_1,sotagshaveworkedwith,toolstechwanttoworkwith,officestacksyncadmired,newcollabtoolsdesirenextyear,knowledge_7,ainextneither different nor similar,sexualorientation,adspriorities5,webframechoice,impsyn,embeddedhaveworkedwith,databasechoice,agree_legacy,officestacksyncwanttoworkwith,buynewtool,stackoverflowjobsrecommend,learncodeai,stackoverflowmetachat,checkincode,adspriorities1,influenceworkstation,assessjoboffice,newjobhunt,jobemailpriorities4,webframewantentry,which languages are you proficient in? (java),annoyingui,stackoverflowjoblisting,aidangerous,commit_frequency,agree_loveboss,stackoverflowbetter,do you have a stack overflow careers 2.0 profile?,wantworkplatform,equipmentsatisfiedrw,wantworklanguage,assessbenefits7,welcomechange,difficultcommunication,techlist,buildvsbuy,techoppose_5,rep_range,ainextno change,excoderskills,assessbenefits3,learncodeonline,frequency_2,importantbenefits,open_to_new_job,salary_range,inthezone,why_learn_new_tech,newrole,influencerecruitment,assessjob9,webframeadmired,stackoverflowsatisfaction,stackoverflowjobs,lastint,newdevopsimpt,frequency_3,sotagshaveentry,aichallenges,aitoolcurrently mostly ai,assessjobprojects,aimodelsadmired,languageadmired,jobsatpoints_15_text,platformhaveentry,student,jobsatpoints_14,aiagentknowwrite,how likely is it that a recommendation you make will be acted upon?,ethicsresponsible,"if your company has a native mobile app, what platforms do you support? (iphone)",aihuman,influencecommunication,important_sameend,stackoverflowanswer,learncodechoose,important_promotion,select all that apply (current lang & tech: android),assessjob4,aiagentorchwrite,agree_alcohol,what advertisers do you remember seeing on stack overflow? (open-ended response),"you answered you don't have a careers profile, can you elaborate why?",dogs_vs_cats,officestackwantentry,select all that apply (why try stack overflow careers: no spam),so_actions_7,stackoverflowhelpful,problemsolving,devenvsadmired,aisearchdevadmired,aiagentobswrite,aiagentimpactneutral,"in an average week, how do you spend your time? (developing new features)",programming_ability,learnedhiring,q120,sojobs,sofriction,friendsdevelopers,university,stackoverflowadsrelevant,buildingthings,stackoverflowmoderation,sovisitfreq,ethnicity,onboarding,adspriorities3,waketime,learncodecoursescert,select up to 3 (how can companies improve interview process: more live code),languageswantentry,assessjobcommute,stackoverflowhasaccount,jobcontactpriorities1,jobemailpriorities1,collaborateremote,assessjobtech,haveworkedlanguage,important_wfh,aitoolcurrently partially ai,stackoverflowdevices,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",purchasewhat,what types of purchases are you involved in? (hardware),understandcomputers,truefalse_3,databasehaveworkedwith,influencetechstack,aiinteresting,tech_want,assessjob2,devenvwantentry,majorundergrad,what type of project are you developing?,nondevelopertype,salary,newothercomms,betterlife,so_actions_10,misctechadmired,interestedanswers,adspriorities4,clickykeys,programhobby,jobsatpoints_7,adspriorities6,newsosites,aiexplain,otherpeoplescode,adblocker,jobsatpoints_10,knowledge_5,ethicsreport,blockchainis,where do you work remotely most of the time?,importanthiringcompanies,boringdetails,adblockerdisable,adblockerreasons,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),assessjob10,jobsecurity,airesponsible,stackoverflowcommunity,commplatformwanttoworkwith,tabsspaces,offon,seriouswork,so_actions_6,stackoverflowmakemoney,militaryus,visit_frequency,excoder10years,projectmanagement,jobsatpoints_11,aiagentimpactstrongly agree,lasthiredate,have you visited / are you aware of stack overflow careers 2.0?,aiagentchallengesstrongly agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",opensource,how do you prefer to be contacted about job opportunities? (email),questionsinteresting,salarytype,newlearn,techoppose_9,assessjobcompensation,databasedesirenextyear,hourscomputer,convertedcompyearly,aiagentknowledge,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",have you visited / are you aware of stack overflow careers?,platformworkedwith,aisearchwanttoworkwith,frustration,equipmentsatisfiedstorage,webframehaveworkedwith,blockchain,educationparents,skipmeals,toolcountpersonal,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),platformwantentry,platformwanttoworkwith,how did you find out about your current job?,importanthiringrep,sotagsadmired,soaccount,ainextvery different,mentalhealth,stackoverflowvisit,coderevhrs,embeddedadmired,aisent,enjoydebugging,which technologies are you excited about? (node.js),assessjobleaders,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",desktop_os,mobiledevelopertype,importanthiringgettingthingsdone,are you currently looking for a job or open to new opportunities?,metricassess,how often do you find solutions to your programming problems on stack overflow without asking a new question?,jobseekingstatus,ainextsomewhat similar,newcollabtoolsworkedwith,assessjob3,techendorse_4,so_actions_3,what other departments / roles do you interact with regularly? (system administrators),age_midpoint,stackoverflowdescribes,techendorse_6,techoppose_11,excoderwillnotcode,were you aware of the apptivate contest?,aitoolplan to mostly use ai,which us state or territory do you live in?,webframeworkedwith,languageworkedwith,occupation,ergonomicdevices,equipmentsatisfiedram,newofftopic,aiagentorchestration,important_variety,jobcontactpriorities3,newovertime,platformhaveworkedwith,screenname,sexuality,hoursperweek,what is your current stack overflow reputation?,soai,devenvswanttoworkwith,vchostingprofessional use,officestackasyncadmired,important_newtech,assessjobexp,overpaid,sotagswant entry,select all that apply (most urgent info about job opportunity: salary),importanthiringalgorithms,misctechwanttoworkwith,resumeprompted,ethicalimplications,responseid,aisearchdevhaveworkedwith,officestackasynchaveworkedwith,icorpm,collector,assessjobdept,workloc,wantworkdatabase,devenvshaveworkedwith,assessjobproduct,workchallenge,jobsearchstatus,importanthiringopensource,so_actions_4,knowledge_8,select up to 3 (appealing message traits: message is personalized),"in the last 12 months, how much money have you spent on personal technology-related purchases?",aifrustration,jobemailpriorities2,importanthiringpmexp,assessjobrole,select all that apply (source control used: git),sovisit1st,experience_range,why_stack_overflow,yearscodedjobpast,aithreat,assessbenefits8,stackoverflowdevstory,aiagentchallengessomewhat disagree,jobseek,aiopen,lastnewjob,agreedisagree1,which of the following best describes your occupation?,commplatformadmired,assessjob7,jobsatpoints_9,assessbenefits4,influenceviztools,sotimesaved,jobsatpoints_8,jobsat,truefalse_2,ethicschoice,languagewanttoworkwith,"including bonus, what is your annual compensation in usd?",important_buildnew,important_companymission,coderev,techendorse_2,stackoverflowparticipate,us_state,aiagentimpactsomewhat agree,opsyspersonal use,devenvhaveentry,devenvschoice,agree_adblocker,undergradmajor,so_actions_15_text,aiagentextwrite,women_on_team,so_actions_9,aicomplex,hoursoutside,hypotheticaltools3,platformchoice,aiagentchallengesstrongly disagree,jobcontactpriorities5,agree_tech,aitoolnot interested in using,assessjobprofdevel,assessjob6,surveyyear,experience_midpoint,newjobhuntresearch,ainextmuch more integrated,excodernotforme,did you participate in the apptivate contest?,sonewcontent,excoderbelonged,webframedesirenextyear,toolcountwork,questionsconfusing,hypotheticaltools5,important_control,aimodelschoice,embeddedwanttoworkwith,techoppose_13,learningnewtech,auditoryenvironment,stackoverflowrecommend,which technology products do you own? (you can choose more than one) (iphone),interview_likelihood,aiagentchange,sotagswanttoworkwith,influencecloud,socomm,numbermonitors,workpaycare,equipmentsatisfiedcpu,aiacc,officestacksynchaveworkedwith,hypotheticaltools2,pronouncegif,assessjob1,techoppose_16,aimodelshaveentry,assessbenefits11,aisearchdevwanttoworkwith,techoppose_15,occupation_group,stackoverflowconsidermember,officestackasyncwanttoworkwith,check,so_actions_5,aitoolcurrently using,influenceconsultants,hopefiveyears,aliens,sohow,sofindanswer,commplatformwantentr,influencehardware,agreedisagree3,importanthiringcommunication,adsagreedisagree3,so_actions_16,techdoc,aimodelswanttoworkwith,new_job_value,how often do you visit job boards?,knowledge_2,assessbenefits6,updatecv,which of the following languages or technologies have you used significantly in the past year? (c),ainextvery similar,newpurchaseresearch,agentusesgeneral,newcollabtoolswanttoworkwith,do you enjoy working remotely?,databasehaveentry,accessibility,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",salary_midpoint,fizzbuzz,equipmentsatisfiedmonitors,"including yourself, how many developers are employed at your company?",professional,aiagentchallengessomewhat agree,agree_problemsolving,techendorse,databasewantentry,aimodelswantentry,changeworld,assessbenefits1,what best describes your career / job satisfaction?,jobcontactpriorities4,aiselect,excoderreturn,investtimetools,opsysprofessional use,languagedesirenextyear,ainextmore integrated,assessbenefits2,influencedatabase,selftaughttypes,misctechhaveworkedwith,techendorse_1,misctechworkedwith,mgrwant,so_actions_1,frequency_1,select up to 3 (most annoying about job search: finding time),aimodelshaveworkedwith,purchasehow,toolstechhaveworkedwith,databasewanttoworkwith,aiagentobservesecure,job_discovery,aitooldon't plan to use ai for this task,techendorse_9,trans,haveworkedframework,aiagents,shipit,timeafterbootcamp,adspriorities7,aidevhaveworkedwith,assessjobindustry,wantworkframework,assessbenefits5,stackoverflowjobsearch,aitoolplan to partially use ai,languageshaveentry,ainextmuch less integrated,techendorse_7,job_search_annoyance,how many developers are employed at your company?,importanthiringeducation,webframehaveentry,big_mac_index,haveworkedplatform,mainbranch,frameworkdesirenextyear,purchaseinfluence,jobemailpriorities7,agreedisagree2,aiagent_uses,sopartfreq,agree_diversity,age1stcode,knowledge_4,select all that apply (why answer: help a programmer in need),un_subregion,platformdesirenextyear,compfreq,techoppose_15_text,employmentaddl,ailearnhow,hypotheticaltools4,adsagreedisagree2,select all that apply (why use stack overflow: help for job),careersat,tbranch,star_wars_vs_star_trek,workstart,knowledge_9,assessbenefits9,competepeers,stackoverflowcopiedcode,stackoverflowwhatdo,currencysymbol,techoppose_7,aifuture,assessjobremote,soduration,resumeupdate,important_ownoffice,educationimportant,webframewanttoworkwith,opensourcer,respondent,ainextsomewhat different,workplan,entteams,techoppose_2,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",importanthiringtechexp,hackathonreasons,knowledge_1,timeanswering,how do you use stack overflow? (read other people's questions to solve my problems),toolstechadmired,professionalquestion,containers,databaseadmired,influenceinternet,eduother,agree_nightcode,timesearching,techendorse_8,unit_testing,stackoverflownewquestion,adsactions,commplatformhaveentr,unittests,platformadmired,aiethics,what is your involvement in purchasing? you can choose more than 1. (influencer),have you changed jobs in the last 12 months?,agree_mars,techendorse_5,aiagentexternal,assessbenefits10,aisearchhaveworkedwith,tech_do,extraversion,so_dev_content,ainextless integrated,newcollabtoolshaveworkedwith,officestackhaveentry,newdevops,jobfactors,stackoverflowcompanypage,frameworkworkedwith,itperson,jobsatpoints_4,select up to 3 (most important aspect of new job opportunity: salary),commplatformhaveworkedwith,assessjob5,assessjob8,educationtypes,how_to_improve_interview_process,professionaltech,excoderbalance,jobemailpriorities3,aiagentchallengesneutral,jobemailpriorities6,newpurplelink,cousineducation,mgridiot,exercise,aiagentimpactsomewhat disagree,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",mgrmoney,workweekhrs,sohowmuchtime,influencedepttech,jobsatpoints_13,newstuck,select all that apply (future lang & tech: android),highesteducationparents,how often are you contacted by recruiters?,techendorse_13_text,blockchainorg,stackoverflowfoundanswer,techoppose_1,importanthiringtitles,sovisitto,which of our sites do you frequent most?,adspriorities2,vcinteraction,influenceservers,aiagentimpactstrongly disagree,hypotheticaltools1,so_region,languagechoice,professionalcloud,techoppose_3,developer_challenges,haveworkeddatabase,knowledge_6,jobsatpoints_5,socialmedia,so_actions_15,adsagreedisagree1,jobsatpoints_6,jobsatpoints_1,jobemailpriorities5,newonboardgood,newedimpt,aiben,diversityimportant,please rate the advertising you've seen on stack overflow (the ads are relevant),"in an average week, how do you spend your time at work? (developing new features)",excoderactive,rightwrongway,kinshipdevelopers,webdevelopertype,newcollabtoolsadmired,techendorseintro,timefullyproductive,assessjobdiversity,misctechdesirenextyear,languagehaveworkedwith,jobsatpoints_16,uk_country,agree_notice,aidevwanttoworkwith,jobsatpoints_15,important_buildexisting,codingactivities,vchostingpersonal use,do you have a stack overflow careers profile?,dependents,team_size_range,challengemyself,select all that apply (training & education: no formal training),knowledge_3,techendorse_13,jobcontactpriorities2,techendorse_3,stackoverflowadsdistracting,years_coding,education,employment,company_size,salary.1,job_satisfaction,job_title,developer_type,industry,country,programming_experience,database_worked_with,dev_environment,operating_sys,dev_methodology,communication_tools,gender,age,learning,work_experience,remote,team_size,survey_easy,version_control_sys,currency,hobby,race
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,California,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Start Up (1-25),,So happy it hurts,,,Web Services,United States of America,,,,Linux,,,,30-34,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$2,001-$3,000",,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,"$40,000 - $60,000",,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Start Up (1-25),,I enjoy going to work,,,Software Products,Other Europe,,,,Mac OS X,,,,40-50,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,"$20,000 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,,Mature Small Business (25-100),,So happy it hurts,,,Software Products,South America,,,,Windows 7,,,,30-34,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,"$60,000 - $80,000",,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41435,,,Mid Sized (100-999),,FML,,,Healthcare,Other Asia,,,,Mac OS X,,,,30-34,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,"<$20,000",,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41310,,,"Other (not working, consultant, etc.)",,I enjoy going to work,,,Other,Other Europe,,,,Linux,,,,25-29,,,,,,,,,


In [25]:
combined_consolidated_df = df_all.copy()

### Consolidate values for country

In [26]:
# Find columns containing 'country'
country_columns = [col for col in df_use.columns if 'country' in str(col).lower()]

print("Columns containing 'country':")
for col in country_columns:
    try:
        # Select the column(s). If multiple columns share the same name this returns a DataFrame.
        selected = df_use.loc[:, col]
        # If a DataFrame is returned (duplicate column names), collapse to a single Series by taking
        # the first non-null value across duplicates for each row.
        if isinstance(selected, pd.DataFrame):
            if selected.shape[1] > 1:
                print(f"\nWarning: column name '{col}' is duplicated ({selected.shape[1]} columns). Combining duplicates by taking first non-null value.")
            series = selected.bfill(axis=1).iloc[:, 0]
        else:
            series = selected.squeeze()

        print(f"\nColumn: {col}")
        print("Top 5 most common values and their counts:")
        counts = series.fillna('NULL').value_counts().head()
        print(counts)

        # Get unique count excluding nulls
        unique_count = series.dropna().nunique()
        print(f"\nTotal unique values (excluding nulls): {unique_count}")
        print(f"Number of null values: {series.isnull().sum()}")
        print("-" * 50)
    except Exception as e:
        print(f"\nError processing column {col}: {str(e)}")
        print("-" * 50)


Columns containing 'country':

Column: what country do you live in?
Top 5 most common values and their counts:
what country do you live in?
NULL              152992
United States        400
India                166
United Kingdom       143
Germany               85
Name: count, dtype: int64

Total unique values (excluding nulls): 80
Number of null values: 152992
--------------------------------------------------


Column: country
Top 5 most common values and their counts:
country
United States               15827
NULL                        15330
India                       14075
United States of America    13283
Germany                     10046
Name: count, dtype: int64

Total unique values (excluding nulls): 235
Number of null values: 15330
--------------------------------------------------


Column: country
Top 5 most common values and their counts:
country
United States               15827
NULL                        15330
India                       14075
United States of America 

In [27]:
# Mapping of variants -> canonical names (lowercased keys for matching)
country_map = {
    'united states': 'United States',
    'united states of america': 'United States',
    'united kingdom of great britain and northern ireland': 'United Kingdom',
    'united kingdom': 'United Kingdom',
    'trinidad and tobago': 'Trinidad and Tobago',
    'trinidad & tobago': 'Trinidad and Tobago',
    'syrian arab republic': 'Syria',
    'syria': 'Syria',
    'other country (not listed above)': 'Other',
    'other (please specify)': 'Other',
    'myanmar, {burma}': 'Myanmar',
    'myanmar': 'Myanmar',
    'libyan arab jamahiriya': 'Libya',
    'libya': 'Libya',
    'laos': 'Laos',
    "lao people's democratic republic": 'Laos',
    'korea south': 'South Korea',
    'republic of korea': 'South Korea',
    'south korea': 'South Korea',
    'korea north': 'North Korea',
    'north korea': 'North Korea',
    'ireland': 'Ireland',
    'ireland {republic}': 'Ireland',
    'hong kong (s.a.r.)': 'Hong Kong',
    'hong kong': 'Hong Kong',
    'guinea-bissau': 'Guinea',
    'guinea': 'Guinea',
    'bosnia herzegovina': 'Bosnia and Herzegovina',
    'bosnia and herzegovina': 'Bosnia and Herzegovina',
    'bosnia-herzegovina': 'Bosnia and Herzegovina',
    'vatican city state': 'Vatican',
    'vatican': 'Vatican',
    'viet nam': 'Vietnam',
    'vietnam': 'Vietnam'
}

In [28]:
# Standardize and map 'country' values in combined_consolidated_df
def standardize_country(val):
    # Preserve NaN/None as-is
    if pd.isna(val):
        return val
    # Normalize to lower-case stripped string for lookup
    key = str(val).strip().lower()
    # Return mapped canonical name if available, otherwise return original (preserve original casing)
    return country_map.get(key, val)

# Apply mapping (overwrites 'country' column if present)
if 'country' in combined_consolidated_df.columns:
    combined_consolidated_df['country'] = combined_consolidated_df['country'].apply(standardize_country)
    print("Mapped 'country' values using country_map. Sample counts:")
    # Print the top values (including NaN) to give a quick check
    print(combined_consolidated_df['country'].value_counts(dropna=False).head(20))
else:
    print("Warning: 'country' column not found in combined_consolidated_df")

Mapped 'country' values using country_map. Sample counts:
country
United States         30699
India                 14533
Germany               10309
NaN                   10042
United Kingdom         9382
Canada                 5367
France                 4451
Brazil                 3344
Poland                 3312
Netherlands            3184
Australia              3017
Italy                  2742
Spain                  2682
Russian Federation     2564
Sweden                 2163
Ukraine                2019
Switzerland            1672
Israel                 1467
Austria                1443
Turkey                 1429
Name: count, dtype: int64


In [29]:
# Remove columns from combined_consolidated_df with >70% null values
threshold = 0.7
null_pct = combined_consolidated_df.isna().mean()
cols_to_drop = null_pct[null_pct > threshold].index.tolist()
print(f"Dropping {len(cols_to_drop)} columns with >70% null values:")
print(cols_to_drop)
combined_consolidated_df.drop(columns=cols_to_drop, inplace=True)
print(f"New shape of combined_consolidated_df: {combined_consolidated_df.shape}")

Dropping 639 columns with >70% null values:
['expectedsalary', 'aitoolinterested in using', 'truefalse_1', 'sotagshaveworkedwith', 'toolstechwanttoworkwith', 'officestacksyncadmired', 'newcollabtoolsdesirenextyear', 'knowledge_7', 'ainextneither different nor similar', 'sexualorientation', 'adspriorities5', 'webframechoice', 'impsyn', 'embeddedhaveworkedwith', 'databasechoice', 'agree_legacy', 'officestacksyncwanttoworkwith', 'buynewtool', 'stackoverflowjobsrecommend', 'learncodeai', 'stackoverflowmetachat', 'checkincode', 'adspriorities1', 'influenceworkstation', 'assessjoboffice', 'newjobhunt', 'jobemailpriorities4', 'webframewantentry', 'which languages are you proficient in? (java)', 'annoyingui', 'stackoverflowjoblisting', 'aidangerous', 'commit_frequency', 'agree_loveboss', 'stackoverflowbetter', 'do you have a stack overflow careers 2.0 profile?', 'wantworkplatform', 'equipmentsatisfiedrw', 'wantworklanguage', 'assessbenefits7', 'welcomechange', 'difficultcommunication', 'techli

In [30]:
# Detailed column stats sorted by non-null count (descending)
col_stats = pd.DataFrame({
    'non_null_count': combined_consolidated_df.notna().sum(),
    'null_count': combined_consolidated_df.isna().sum(),
    'unique_count': combined_consolidated_df.nunique(dropna=True)
}).sort_values('non_null_count', ascending=False)


In [31]:
# Add percent of total rows for nulls (rounded to 2 decimals)
total_rows = combined_consolidated_df.shape[0]
col_stats['null_pct'] = (col_stats['null_count'] / total_rows * 100).round(2)

In [32]:
col_stats

Unnamed: 0,non_null_count,null_count,unique_count,null_pct
surveyyear,154521,0,15,0.0
country,144479,10042,229,6.5
employment,140846,13675,151,8.85
education,137278,17243,554,11.16
years_coding,128780,25741,156,16.66
age,125492,29029,121,18.79
programming_experience,116643,37878,126,24.51
developer_type,113405,41116,14745,26.61
company_size,108436,46085,38,29.82
survey_easy,107361,47160,18,30.52


### Consolidate values for 'gender'

In [33]:
combined_consolidated_df['gender'].value_counts()

gender
Man                                                                                   52647
Male                                                                                  29939
Woman                                                                                  3561
Female                                                                                 1947
Non-binary, genderqueer, or gender non-conforming                                       544
Prefer not to say                                                                       507
Prefer not to disclose                                                                  176
Man;Non-binary, genderqueer, or gender non-conforming                                   136
Woman;Non-binary, genderqueer, or gender non-conforming                                 135
Or, in your own words:                                                                  133
Other                                                                    

In [37]:
# Keep the portion before the first semicolon in df_final['gender'] and convert to lower case
def keep_before_semicolon(val):
    if pd.isna(val):
        return val
    if isinstance(val, str):
        return val.split(';', 1)[0].strip()
    return val

combined_consolidated_df['gender_raw'] = combined_consolidated_df['gender'].apply(keep_before_semicolon)
combined_consolidated_df['gender_raw'] = combined_consolidated_df['gender_raw'].str.lower()

# Quick check
print(combined_consolidated_df['gender_raw'].value_counts(dropna=False).head(20))

gender_raw
NaN                                                  64233
man                                                  52911
male                                                 30028
woman                                                 3759
female                                                2035
non-binary, genderqueer, or gender non-conforming      550
prefer not to say                                      507
prefer not to disclose                                 176
or, in your own words:                                 137
other                                                  111
transgender                                             47
gender non-conforming                                   27
Name: count, dtype: int64


In [43]:
# Exact mapping for common raw values (keys are lowercased/stripped)
exact_map = {
    'nan': None,  # placeholder, handled below
    'man': 'man',
    'male': 'man',
    'woman': 'woman',
    'female': 'woman',
    'non-binary, genderqueer, or gender non-conforming': 'non-binary',
    'gender non-conforming': 'non-binary',
    'transgender': 'transgender',
    'prefer not to say': 'prefer_not_to_say',
    'prefer not to disclose': 'prefer_not_to_say',
    'or, in your own words:': 'other',
    'other': 'other'
}

def apply_exact_map(val):
    # Preserve NaN as unknown
    if pd.isna(val):
        return 'unknown'
    key = str(val).strip().lower()
    return exact_map.get(key, None)  # None if no exact mapping

# Create exact mapping column
combined_consolidated_df['gender_update'] = combined_consolidated_df['gender_raw'].apply(apply_exact_map)

# Quick summary counts
print("Exact-mapped counts:")
print(combined_consolidated_df['gender_update'].value_counts(dropna=False))

Exact-mapped counts:
gender_update
man                  82939
unknown              64233
woman                 5794
prefer_not_to_say      683
non-binary             577
other                  248
transgender             47
Name: count, dtype: int64


In [44]:
# extract surveyyear and country and make a counts matrix
cols = ['surveyyear', 'gender_update']
df_sub = combined_consolidated_df[cols].copy()

# drop rows missing either value
#df_sub = df_sub.dropna(subset=['surveyyear', 'country'])

# ensure surveyyear is treated consistently (optional)
# df_sub['surveyyear'] = df_sub['surveyyear'].astype(str)

# Create matrix: rows = surveyyear, cols = country, values = counts
matrix_df = pd.crosstab(df_sub['surveyyear'], df_sub['gender_update']).sort_index()

# display and optionally save
display(matrix_df)
matrix_df.to_csv('surveyyear_by_gender_mapped_final_matrix.csv')

gender_update,man,non-binary,other,prefer_not_to_say,transgender,unknown,woman
surveyyear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,0,0,0,0,0,563,0
2012,0,0,0,0,0,1249,0
2013,0,0,0,0,0,1948,0
2014,1364,0,0,23,0,74,68
2015,0,0,0,0,0,5217,0
2016,10291,0,58,153,0,85,619
2017,6357,27,53,0,17,3294,530
2018,12016,53,0,0,30,6854,818
2019,15598,115,0,0,0,695,1369
2020,9219,72,0,0,0,2771,830


In [None]:
# Clean up gender columns: drop old versions and rename gender_update to gender
columns_to_drop = ['gender', 'gender_raw', 'gender_mapped_exact']
existing_cols = [col for col in columns_to_drop if col in combined_consolidated_df.columns]

if existing_cols:
    combined_consolidated_df = combined_consolidated_df.drop(columns=existing_cols)
    print(f"Dropped columns: {existing_cols}")

if 'gender_update' in combined_consolidated_df.columns:
    combined_consolidated_df = combined_consolidated_df.rename(columns={'gender_update': 'gender'})
    print("Renamed 'gender_update' to 'gender'")

print(f"Final shape: {combined_consolidated_df.shape}")

### years_coding

In [None]:
def clean_years_coding(val):
    """
    Clean years_coding values:
    - Convert ranges (e.g., '9-11 years') to median
    - Extract single numeric values
    - Drop values >100
    - Return NaN for non-numeric values
    """
    if pd.isna(val):
        return None
    
    val_str = str(val).strip().lower()
    
    # Extract numbers from ranges (e.g., "9-11 years" or "9-11")
    range_match = re.search(r'(\d+)\s*[-–—to]\s*(\d+)', val_str)
    if range_match:
        start = float(range_match.group(1))
        end = float(range_match.group(2))
        median_val = (start + end) / 2
        return median_val if median_val <= 100 else None
    
    # Extract single number (e.g., "5 years" or "5")
    single_match = re.search(r'(\d+(?:\.\d+)?)', val_str)
    if single_match:
        num_val = float(single_match.group(1))
        return num_val if num_val <= 100 else None
    
    # No numeric value found
    return None

# Apply cleaning function
combined_consolidated_df['years_coding_clean'] = combined_consolidated_df['years_coding'].apply(clean_years_coding)

# Show comparison
print(f"Min: {combined_consolidated_df['years_coding_clean'].min()}")
print(f"Max: {combined_consolidated_df['years_coding_clean'].max()}")
print(f"Mean: {combined_consolidated_df['years_coding_clean'].mean():.2f}")
print(f"Median: {combined_consolidated_df['years_coding_clean'].median()}")

Min: 1.0
Max: 100.0
Mean: 12.04
Median: 10.0


In [None]:
# Drop the old years_coding column and rename years_coding_clean
combined_consolidated_df = combined_consolidated_df.drop(columns=['years_coding'])
combined_consolidated_df = combined_consolidated_df.rename(columns={'years_coding_clean': 'years_coding'})

print(f"Updated columns. New shape: {combined_consolidated_df.shape}")
print(f"\nYears coding column summary:")
print(f"  Non-null count: {combined_consolidated_df['years_coding'].notna().sum()}")
print(f"  Min: {combined_consolidated_df['years_coding'].min()}")
print(f"  Max: {combined_consolidated_df['years_coding'].max()}")
print(f"  Mean: {combined_consolidated_df['years_coding'].mean():.2f}")

Updated columns. New shape: (151564, 47)

Years coding column summary:
  Non-null count: 125509
  Min: 1.0
  Max: 100.0
  Mean: 12.04


### Consolidate Education

In [None]:
def standardize_education(val):
    """
    Standardize education values:
    - bachelor's -> bachelors
    - master's -> masters
    - on-the-job training variations -> on_the_job_training
    """
    if pd.isna(val):
        return val
    
    val_str = str(val).strip().lower()
    
    # Check for bachelor's variations
    if "bachelor" in val_str or "b.a" in val_str or "b.s." in val_str or "b.s" in val_str:
        return "bachelors"
    
    # Check for master's variations
    if "master" in val_str:
        return "masters"
    
    # Check for on-the-job training variations
    if "self-taught" in val_str or "online class" in val_str or "self taught" in val_str:
        return "self-taught"

    # Check for on-the-job training variations
    if "on-the-job" in val_str or "on the job" in val_str or "job training" in val_str:
        return "on the job training"

    # Check for on-the-job training variations
    if "some college" in val_str:
        return "some college"
    
    # Check for on-the-job training variations
    if "primary" in val_str or "secondary" in val_str:
        return "less than college"

    # Check for on-the-job training variations
    if "associate" in val_str:
        return "associate degree"
    
    # Check for on-the-job training variations
    if "md" in val_str or "doctor" in val_str or "phd" in val_str or "doctoral" in val_str or "jd" in val_str:
        return "doctoral degree"
    
    # Check for on-the-job training variations
    if "full-time, intensive" in val_str or "part-time program" in val_str or "industry certification" in val_str or "mentorship program" in val_str or "something else" in val_str or "other" in val_str:
        return "other program"
    
    # Check for on-the-job training variations
    if "i prefer not to say" in val_str or "i prefer not to answer" in val_str:
        return "i prefer not to say"
    
    # Check for on-the-job training variations
    if "i never completed any formal education" in val_str:
        return "none"
    
    # Return original value if no match
    return val

# Apply standardization to education column
combined_consolidated_df['education_clean'] = combined_consolidated_df['education'].apply(standardize_education)

# Create a pivot-style DataFrame from education value counts
education_counts = combined_consolidated_df['education_clean'].value_counts().reset_index()
education_counts.columns = ['Education Level', 'Count']

# Add percentage column
total = education_counts['Count'].sum()
education_counts['Percentage'] = (education_counts['Count'] / total * 100).round(2)

# Display the formatted table
display(education_counts)

NameError: name 'combined_consolidated_df' is not defined

In [None]:
# Drop the old education column and rename education_clean to education
combined_consolidated_df = combined_consolidated_df.drop(columns=['education'])
combined_consolidated_df = combined_consolidated_df.rename(columns={'education_clean': 'education'})

print(f"Updated columns. New shape: {combined_consolidated_df.shape}")
print(f"\nEducation column summary:")
print(f"  Non-null count: {combined_consolidated_df['education'].notna().sum()}")
print(f"\nValue counts:")
print(combined_consolidated_df['education'].value_counts())

Updated columns. New shape: (151564, 46)

Education column summary:
  Non-null count: 137278

Value counts:
education
bachelors              61140
masters                30519
less than college      15207
some college           15080
doctoral degree         5710
associate degree        3675
self-taught             3672
other program           1271
none                     432
i prefer not to say      243
on the job training      192
Professional degree      137
Name: count, dtype: int64


### Consolidate age

In [4]:
combined_consolidated_df['age'].value_counts()

age
25-34 years old      26873
18-24 years old      15779
35-44 years old      15633
25 - 34 years old     6420
45-54 years old       6242
                     ...  
21.5                     1
3.0                      1
37.5                     1
16.9                     1
76.0                     1
Name: count, Length: 191, dtype: int64

In [9]:
def clean_age(val):
    """
    Clean age values:
    - Convert ranges (e.g., '25-34 years old') to median
    - Extract single numeric values
    - Handle special cases like 'Under 18' or '65 or older'
    - Force any values under 12 to be 12
    - Return NaN for non-numeric values
    """
    if pd.isna(val):
        return None
    
    val_str = str(val).strip().lower()
    result = None
    
    # Handle special cases
    if 'under' in val_str or 'less than' in val_str:
        # Extract the number after 'under' or 'less than'
        match = re.search(r'(\d+)', val_str)
        if match:
            result = float(match.group(1)) - 1  # e.g., "Under 18" -> 17
    
    elif 'older' in val_str or 'over' in val_str or 'above' in val_str:
        # Extract the number before 'older', 'over', or 'above'
        match = re.search(r'(\d+)', val_str)
        if match:
            result = float(match.group(1))  # e.g., "65 or older" -> 65
    
    # Extract numbers from ranges (e.g., "25-34" or "25 - 34")
    elif (range_match := re.search(r'(\d+)\s*[-–—to]\s*(\d+)', val_str)):
        start = float(range_match.group(1))
        end = float(range_match.group(2))
        result = (start + end) / 2
    
    # Extract single number (e.g., "25 years old" or "25")
    elif (single_match := re.search(r'(\d+)', val_str)):
        result = float(single_match.group(1))
    
    # Force minimum age of 12
    if result is not None and result < 12:
        result = 12
    
    return result

# Apply cleaning function to combined_consolidated_df
combined_consolidated_df['age_clean'] = combined_consolidated_df['age'].apply(clean_age)

# Show statistics
print(f"Age Statistics:")
print(f"Min: {combined_consolidated_df['age_clean'].min()}")
print(f"Max: {combined_consolidated_df['age_clean'].max()}")
print(f"Mean: {combined_consolidated_df['age_clean'].mean():.2f}")
print(f"Median: {combined_consolidated_df['age_clean'].median()}")
print(f"\nValue counts:")
print(combined_consolidated_df['age_clean'].value_counts().sort_index().head(50))

Age Statistics:
Min: 12.0
Max: 99.0
Mean: 31.65
Median: 29.5

Value counts:
age_clean
12.0       23
13.0       44
14.0       62
15.0      132
16.0      174
17.0     3748
18.0      388
19.0      484
20.0     1533
21.0    19600
22.0     3985
23.0     1244
24.0     1389
25.0     1451
26.0     1383
27.0     4740
28.0     1332
29.0     1258
29.5    33293
30.0     1292
31.0      954
32.0     3426
33.0      893
34.0      756
35.0      787
36.0      583
37.0     2154
38.0      569
39.0      416
39.5    17957
40.0      451
41.0      296
42.0      362
43.0      288
44.0      231
44.5     1019
45.0      639
46.0      191
47.0      163
48.0      156
49.0      143
49.5     6942
50.0      141
51.0      103
52.0      103
53.0       90
54.0       82
54.5      307
55.0       78
55.5       93
Name: count, dtype: int64


In [None]:
# Drop the old age column and rename age_clean to age
combined_consolidated_df = combined_consolidated_df.drop(columns=['age'])
combined_consolidated_df = combined_consolidated_df.rename(columns={'age_clean': 'age'})

print(f"Updated columns. New shape: {combined_consolidated_df.shape}")
print(f"\nAge column summary:")
print(f"  Non-null count: {combined_consolidated_df['age'].notna().sum()}")
print(f"  Min: {combined_consolidated_df['age'].min()}")
print(f"  Max: {combined_consolidated_df['age'].max()}")
print(f"  Mean: {combined_consolidated_df['age'].mean():.2f}")
print(f"  Median: {combined_consolidated_df['age'].median()}")

### Consolidate Developer Type

In [10]:
df['developer_type'].value_counts()

developer_type
Developer, full-stack                                                                                                                                       17062
Developer, back-end                                                                                                                                          9755
Developer, front-end                                                                                                                                         3772
Student                                                                                                                                                      3292
Developer, mobile                                                                                                                                            2539
                                                                                                                                                            ...  
Developer, ba

In [36]:
# Keep only the portion before the first semicolon in developer_type and standardize values
def clean_developer_type(val):
    """
    Extract the first developer type before semicolon and standardize common variations.
    E.g., 'Full-stack developer;Back-end developer' -> 'full-stack developer'
    E.g., 'Developer, full-stack' -> 'full-stack developer'
    """
    if pd.isna(val):
        return val
    if isinstance(val, str):
        # Take only the first value before semicolon
        first_type = val.split(';', 1)[0].strip().lower()
        
        # Standardize full-stack variations
        if 'full-stack' in first_type or 'full stack' in first_type:
            return 'full-stack developer'

        if 'front-end' in first_type or 'front end' in first_type:
            return 'front-end developer'

        if 'back-end' in first_type or 'back end' in first_type:
            return 'back-end developer'
        
        if 'mobile' in first_type or 'architect' in first_type or 'graphics' in first_type or 'devops' in first_type or 'web' in first_type or 'developer' in first_type:
            return 'other developer'

        if 'data scien' in first_type or 'machine learning' in first_type or 'ML' in first_type or 'DS' in first_type :
            return 'data scientist'

        if 'data' in first_type :
            return 'data role'

        if 'engineer' in first_type :
            return 'engineering role'
        
        if 'research' in first_type or 'scientist' in first_type:
            return 'research role'
        
        if 'systems administrator' in first_type or 'system administrator' in first_type:
            return 'systems administrator'
        
        if 'executive' in first_type :
            return 'executive'
        
        if 'design' in first_type :
            return 'design'
        
        if 'security' in first_type or 'blockchain' in first_type:
            return 'cyber security'

        if 'product manager' in first_type or 'project manager' in first_type:
            return 'product/project manager'
        
        if 'other' in first_type :
            return 'other role'

        # Return the cleaned value
        return first_type
    return val

# Apply cleaning to df (which should be combined_consolidated_df based on context)
combined_consolidated_df['developer_type_clean'] = combined_consolidated_df['developer_type'].apply(clean_developer_type)

# Show value counts
print("Developer Type (cleaned) - Top 20:")
print(combined_consolidated_df['developer_type_clean'].value_counts().head(50))

Developer Type (cleaned) - Top 20:
developer_type_clean
back-end developer                 29209
full-stack developer               23102
other developer                    22286
front-end developer                10418
data role                           6844
research role                       3964
student                             3488
data scientist                      3362
engineering role                    2701
other role                          2573
design                              2308
executive                           1011
product/project manager              646
systems administrator                635
cyber security                       358
educator                             290
marketing or sales professional      121
retired                               89
Name: count, dtype: int64


### Export df_final

In [None]:
df_final = combined_consolidated_df.copy()

In [None]:
df_final.to_csv('df_final.csv') 

In [None]:
#df = pd.read_csv('df_final.csv')