In [1]:
import os
import time
import requests
import pandas as pd
from zipfile import ZipFile
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# Configuration
MAX_RETRIES = 10
RETRY_DELAY = 30  # seconds
SURVEY_YEARS = list(range(2011, 2026))  # 2011 to 2025

print("Starting Stack Overflow Survey Data Download...")
print(f"Years to download: {min(SURVEY_YEARS)} to {max(SURVEY_YEARS)}")
print(f"Retry configuration: {MAX_RETRIES} max attempts, {RETRY_DELAY}s delay")
print(f"URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{{year}}.zip\n")


Starting Stack Overflow Survey Data Download...
Years to download: 2011 to 2025
Retry configuration: 10 max attempts, 30s delay
URL pattern: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip



In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
def get_survey_urls(year):
    """
    Generate URL for a given survey year.
    All years use the same datasets ZIP pattern.
    """
    # All years use the same URL pattern
    url = f"https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-{year}.zip"
    return [url]

def fix_headers_for_older_years(df, year):
    """
    Fix headers for years 2011-2016 where the first two rows are headers.
    If the second row says "Response", use only the first row value.
    Otherwise, combine the first two rows.
    """
    if year >= 2011 and year <= 2015:
        if df.shape[0] < 2:
            print(f"  Warning: Not enough rows to fix headers for year {year}")
            return df
        
        # Get the first two rows
        first_row = df.iloc[0].astype(str)
        second_row = df.iloc[1].astype(str)
        
        # Create new column names
        new_columns = []
        for i, (first_val, second_val) in enumerate(zip(first_row, second_row)):
            first_val = first_val.strip()
            second_val = second_val.strip()
            
            # If second row is "Response", just use first row value
            if second_val == "Response":
                new_columns.append(first_val)
            else:
                # If both are the same or second is empty, use first
                if first_val == second_val or second_val == "":
                    new_columns.append(first_val)
                else:
                    # Combine both values - first value as primary
                    new_columns.append(f"{first_val} ({second_val})")
        
        # Set new column names
        df.columns = new_columns
        
        # Drop the first two rows (header rows)
        df = df.iloc[2:].reset_index(drop=True)
        
        print(f"  ✓ Fixed headers for year {year} (combined first two rows, removed 2 header rows)")
    
    return df

def download_file(url, year):
    """
    Download a file (single attempt, no retries).
    Returns the content if successful, None otherwise.
    Handles both CSV and ZIP files.
    """
    try:
        print(f"  Trying URL: {url}")
        response = requests.get(url, timeout=60, stream=True)
        response.raise_for_status()
        
        # Check content type
        content_type = response.headers.get('content-type', '').lower()
        
        # Reject HTML responses (likely error pages)
        if 'html' in content_type and response.status_code == 200:
            # Might be an error page, try next URL pattern
            print(f"  Warning: Received HTML instead of data file, may be wrong URL")
            return None
        
        content = response.content
        # Basic validation: check if content looks reasonable
        if len(content) < 100:
            print(f"  Warning: File too small, may be error page")
            return None
        
        # Check if it's a ZIP file by magic bytes
        is_zip = content[:2] == b'PK'  # ZIP files start with PK
        if is_zip:
            print(f"  ✓ Successfully downloaded {year} as ZIP ({len(content):,} bytes)")
        else:
            print(f"  ✓ Successfully downloaded {year} ({len(content):,} bytes)")
        
        return content
        
    except requests.exceptions.RequestException as e:
        print(f"  ✗ Download error: {str(e)}")
        return None


In [4]:
def validate_url(url):
    """
    Validate if a URL exists without downloading the full content.
    Returns True if the URL is valid and returns proper headers.
    """
    try:
        # Only get headers to check existence
        response = requests.head(url, timeout=10)
        return response.status_code == 200 and 'content-length' in response.headers
    except requests.exceptions.RequestException:
        return False

In [5]:
def download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY, sample_size=None):
    """
    Download and extract survey data for a given year with retry logic.
    Tries multiple URL patterns and handles both CSV and ZIP files.
    Wraps the entire process in retry logic to catch any runtime errors.
    
    Args:
        year: The survey year to download
        max_retries: Maximum number of retry attempts
        delay: Delay between retries in seconds
        sample_size: If provided, only read this many rows from the CSV (for testing/development)
    
    Returns:
        DataFrame if successful, None otherwise.
    """
    print(f"\n{'='*60}")
    print(f"Processing year {year}")
    print(f"{'='*60}")
    
    urls = get_survey_urls(year)
    
    # Outer retry loop for entire download/extract process
    # This will retry the entire process up to max_retries times if a RuntimeError occurs
    for retry_attempt in range(max_retries):
        try:
            # Try each URL pattern
            for url in urls:
                content = download_file(url, year)
                
                if content is None:
                    continue
                
                # Check if content is a ZIP file by magic bytes (ZIP files start with 'PK')
                is_zip = content[:2] == b'PK'
                
                if is_zip:
                    # Try to parse as ZIP
                    try:
                        with ZipFile(BytesIO(content)) as zip_file:
                            # Look for CSV files in the ZIP (exclude macOS metadata)
                            csv_files = [f for f in zip_file.namelist() 
                                       if f.endswith('.csv') and not f.startswith('__MACOSX/')]
                            if csv_files:
                                # Use the first CSV file found
                                csv_file = csv_files[0]
                                print(f"  Found CSV file in ZIP: {csv_file}")
                                with zip_file.open(csv_file) as f:
                                    # For years 2011-2016, read without header to fix manually
                                    read_kwargs = {
                                        'low_memory': False, 
                                        'on_bad_lines': 'skip',
                                        'nrows': sample_size  # Add sample size parameter
                                    }
                                    if year >= 2011 and year <= 2015:
                                        read_kwargs['header'] = None
                                    
                                    try:
                                        df = pd.read_csv(f, encoding='utf-8', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                                    except UnicodeDecodeError:
                                        f.seek(0)
                                        df = pd.read_csv(f, encoding='latin-1', **read_kwargs)
                                        print(f"  ✓ Successfully loaded {year} from ZIP with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                                        # Fix headers for older years
                                        df = fix_headers_for_older_years(df, year)
                                        return df
                            else:
                                print(f"  No CSV files found in ZIP archive")
                    except Exception as e:
                        print(f"  ZIP parsing failed: {str(e)}")
                        continue
                else:
                    # Try to parse as CSV directly
                    # For years 2011-2016, read without header to fix manually
                    read_kwargs = {
                        'low_memory': False, 
                        'on_bad_lines': 'skip',
                        'nrows': sample_size  # Add sample size parameter
                    }
                    if year >= 2011 and year <= 2016:
                        read_kwargs['header'] = None
                    
                    try:
                        df = pd.read_csv(BytesIO(content), encoding='utf-8', **read_kwargs)
                        print(f"  ✓ Successfully loaded {year} as CSV ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                        # Fix headers for older years
                        df = fix_headers_for_older_years(df, year)
                        return df
                    except UnicodeDecodeError:
                        # Try different encoding
                        try:
                            df = pd.read_csv(BytesIO(content), encoding='latin-1', **read_kwargs)
                            print(f"  ✓ Successfully loaded {year} as CSV with latin-1 encoding ({df.shape[0]:,} rows, {df.shape[1]:,} cols)")
                            # Fix headers for older years
                            df = fix_headers_for_older_years(df, year)
                            return df
                        except Exception as e:
                            print(f"  CSV parsing failed: {str(e)}")
                            continue
                    except Exception as e:
                        print(f"  CSV parsing failed: {str(e)}")
                        continue
            
            # If we get here, the URL failed - this triggers a retry if attempts remain
            if retry_attempt < max_retries - 1:
                print(f"  ✗ Download failed for year {year}")
                print(f"  Retrying entire process (attempt {retry_attempt + 2}/{max_retries}) in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed to download and extract data for year {year} after {max_retries} attempts")
                print(f"  URL attempted: {urls[0]}")
                return None
                
        except RuntimeError as e:
            print(f"  ✗ Runtime error on attempt {retry_attempt + 1}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts due to runtime error")
                return None
        except Exception as e:
            # Catch any other unexpected errors and retry
            print(f"  ✗ Unexpected error on attempt {retry_attempt + 1}: {type(e).__name__}: {str(e)}")
            if retry_attempt < max_retries - 1:
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ✗ Failed after {max_retries} attempts")
                return None
    
    return None

In [6]:
# Define sample size for testing (set to None for full dataset)
SAMPLE_SIZE = 1000  # Adjust this value to control how many rows to read from each year

# Download and create dataframes for each year
dataframes = {}

for year in SURVEY_YEARS:
    df = download_and_extract_year(year, max_retries=MAX_RETRIES, delay=RETRY_DELAY)
    if df is not None:
        # Add a year column to track which year the data is from
        df['SurveyYear'] = year
        dataframes[year] = df
    else:
        print(f"⚠ Skipping year {year} - download failed")

print(f"\n{'='*60}")
print(f"Download Summary")
print(f"{'='*60}")
print(f"Successfully downloaded: {len(dataframes)} out of {len(SURVEY_YEARS)} years")
print(f"Years downloaded: {sorted(dataframes.keys())}")
print(f"Years failed: {[y for y in SURVEY_YEARS if y not in dataframes]}")

# Display info for each dataframe
if dataframes:
    print(f"\n{'='*60}")
    print(f"DataFrame Information")
    print(f"{'='*60}")
    for year, df in sorted(dataframes.items()):
        print(f"Year {year}: {df.shape[0]:,} rows × {df.shape[1]:,} columns")


Processing year 2011
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2011.zip
  ✓ Successfully downloaded 2011 as ZIP (80,173 bytes)
  Found CSV file in ZIP: 2011 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2011 from ZIP with latin-1 encoding (2,815 rows, 65 cols)
  ✓ Fixed headers for year 2011 (combined first two rows, removed 2 header rows)

Processing year 2012
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2012.zip
  ✓ Successfully downloaded 2012 as ZIP (266,621 bytes)
  Found CSV file in ZIP: 2012 Stack Overflow Survey Results.csv
  ✓ Successfully loaded 2012 from ZIP with latin-1 encoding (6,245 rows, 75 cols)
  ✓ Fixed headers for year 2012 (combined first two rows, removed 2 header rows)

Processing year 2013
  Trying URL: https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2013.zip
  ✓ Successfully downloaded 2013 as ZIP (689,493 bytes)
  Found CSV file in ZIP: 

In [7]:
# Create combined dataframe from all years
if dataframes:
    print(f"\n{'='*60}")
    print(f"Creating Combined DataFrame")
    print(f"{'='*60}")
    
    # Ensure all dataframes have unique columns before concatenation
    # Find the union of all columns
    all_columns = set()
    for df in dataframes.values():
        all_columns.update(df.columns)
    all_columns = list(all_columns)

    # Reindex each dataframe to ensure unique columns for concat
    aligned_dfs = []
    for year, df in dataframes.items():
        # Remove duplicate columns if any (can happen on bad CSVs)
        df = df.loc[:,~df.columns.duplicated()]
        aligned_df = df.reindex(columns=all_columns)
        aligned_dfs.append(aligned_df)
    
    combined_df = pd.concat(aligned_dfs, ignore_index=True, sort=False)
    
    print(f"success!")
    print(f"  rows: {combined_df.shape[0]:,}")
    print(f"  columns: {combined_df.shape[1]:,}")
    print(f"  Years: {sorted(combined_df['SurveyYear'].dropna().unique())}")
    
    # Show basic info about the combined dataframe
    print(f"\n{'='*60}")
    print(f"Combined DataFrame Info")
    print(f"{'='*60}")
    print(combined_df.info())
    print(f"{'='*60}")
else:
    print("\n⚠ No dataframes were successfully downloaded. Cannot create combined dataframe.")
    combined_df = None



Creating Combined DataFrame
success!
  rows: 772,599
  columns: 1,087
  Years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772599 entries, 0 to 772598
Columns: 1087 entries, nan (Who do you want to communicate with about a new job opportunity: Developer) to TechOppose_1
dtypes: float64(106), int64(1), object(980)
memory usage: 6.3+ GB
None


In [8]:
# Access individual year dataframes: dataframes[year]
# Access combined dataframe: combined_df
# Example:
if dataframes:
    print(f"\n{'='*60}")
    print(f"How to Access Your Data")
    print(f"{'='*60}")
    print(f"Individual year dataframes:")
    print(f"  - dataframes[2024]  # Access 2024 data")
    print(f"  - dataframes[2023]  # Access 2023 data")
    print(f"  - etc.")
    print(f"\nCombined dataframe:")
    print(f"  - combined_df  # All years combined")
    print(f"\nAvailable years: {sorted(dataframes.keys())}")
    
    # Quick preview of the combined dataframe
    if combined_df is not None:
        print(f"\n{'='*60}")
        print(f"Combined DataFrame Preview (first 5 rows)")
        print(f"{'='*60}")
        print(combined_df.head())



How to Access Your Data
Individual year dataframes:
  - dataframes[2024]  # Access 2024 data
  - dataframes[2023]  # Access 2023 data
  - etc.

Combined dataframe:
  - combined_df  # All years combined

Available years: [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Combined DataFrame Preview (first 5 rows)
  nan (Who do you want to communicate with about a new job opportunity: Developer)  \
0                                                                              NaN   
1                                                                              NaN   
2                                                                              NaN   
3                                                                              NaN   
4                                                                              NaN   

   ToolCountPersonal nan (Perception of contact form: Stack Overflow Careers)  \
0                NaN                           

In [9]:
combined_df.head()

Unnamed: 0.1,nan (Who do you want to communicate with about a new job opportunity: Developer),ToolCountPersonal,nan (Perception of contact form: Stack Overflow Careers),NEWCollabToolsHaveWorkedWith,TechEndorse_13_TEXT,NumberMonitors,VCHostingProfessional use,Select up to 3 (Appealing message traits: Message is personalized),JobEmailPriorities3,What advertisers do you remember seeing on Stack Overflow? (Open-Ended Response),nan (Current Lang & Tech: Cloud),Select all that apply (Future Lang & Tech: Android),nan (PhoneGap),nan (Occupation),Country,"nan (Training & Education: Some college, but no CS degree)",InfluenceVizTools,TrueFalse_3,AIDangerous,TechOppose_16,LastNewJob,AINextVery similar,OfficeStackAsyncAdmired,JobContactPriorities2,Blockchain,AIModelsChoice,ImportantHiringGettingThingsDone,Student,nan (Appealing message traits: Salary information),AssessJobExp,agree_notice,Methodology,NEWOnboardGood,EquipmentSatisfiedRW,nan (Go),OpenSourcer,AssessBenefits6,nan (Current Lang & Tech: Write-In),nan (Kindle),un_subregion,OtherPeoplesCode,StackOverflowConsiderMember,Accessibility,age_range,AdsAgreeDisagree3,JobSatisfaction,SO_Actions_3,What other departments / roles do you interact with regularly? (System Administrators),nan (Software as a service / recurring billing),AIAgentExtWrite,"In receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (Message is personalized to me)",SOHowMuchTime,LanguageDesireNextYear,JobSatPoints_6,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (I can recommend or request products),nan (Appealing message traits: Team described),nan (Who do you want to communicate with about a new job opportunity: In-house recruiter),Unnamed: 0,nan (Most important aspect of new job opportunity: Work - Life balance),CurrencyDesc,Select up to 3 (Most important aspect of new job opportunity: Salary),FrameworkDesireNextYear,occupation_group,"nan ($75,001 - $100,000)",AIResponsible,nan (Current Lang & Tech: LAMP),Sexuality,nan (iPad),AIToolCurrently Using,AdsPriorities1,AIAgents,AIAgentExternal,nan (Most important aspect of new job opportunity: Quality of colleagues),nan (Boxee),nan (Most urgent info about job opportunity: Benefits),SurveyTooLong,AgentUsesGeneral,BlockchainOrg,nan (Build my online reputation),nan (Current Lang & Tech: JavaScript),AINextMore integrated,"nan ($25,001 - $40,000)",nan (Why answer: Help future programmers),HomeRemote,CodeRevHrs,TimeSearching,ResumePrompted,DatabaseWantToWorkWith,nan (Future Lang & Tech: Cassandra),nan (Mentions my code or Stack Overflow activity),nan (How can companies improve interview process: Remote interviews),ConvertedSalary,AssessJob2,EquipmentSatisfiedStorage,QuestionsInteresting,nan (Most urgent info about job opportunity: Job title),What Country do you live in?,WakeTime,SOAccount,QuestionsConfusing,AssessBenefits1,nan (Training & Education: Masters in CS),TechDoc,Which of the following best describes your occupation?,nan (Future Lang & Tech: Scala),AIAgentImpactSomewhat agree,PlatformHaveWorkedWith,Frequency_3,CousinEducation,MiscTechAdmired,ClickyKeys,nan (Gender),nan (Future Lang & Tech: CoffeeScript),ImportantHiringOpenSource,AdsPriorities4,SODuration,OfficeStackWantEntry,nan (How can companies improve interview process: Gimme coffee),nan (Testers / Quality Assurance),nan (Current Lang & Tech: Scala),nan (Current Lang & Tech: Clojure),so_region,LanguageChoice,AIModelsAdmired,InfluenceWorkstation,SurveyEasy,MgrWant,nan (Designers),nan (Other Smart Phone),nan (Current Lang & Tech: Cordova),important_companymission,nan (Ruby),nan (Don't know),nan (Current Lang & Tech: Salesforce),Respondent,agree_alcohol,AssessJobProfDevel,How large is the team that you work on?,TechOppose_9,nan (WinRT),OfficeStackHaveEntry,HypotheticalTools2,SeriousWork,AssessBenefits4,tech_do,nan (Why answer: No idea),Have you visited / Are you aware of Stack Overflow Careers 2.0?,nan (Other media streaming device),PlatformWorkedWith,AdsPriorities3,LearnCode,JobSatPoints_5,TechEndorse_7,AIToolNot interested in Using,nan (40 hour work week),StackOverflowRecommend,important_wfh,AIAgentKnowledge,nan (Direct sales to companies),nan (Future Lang & Tech: AngularJS),aliens,nan (Opportunity to Use/Learn New Technologies),collector,RightWrongWay,AssessBenefits9,nan (Current Lang & Tech: MongoDB),nan (Ask questions to solve problems),AINextNo change,NEWCollabToolsAdmired,DevEnvsChoice,nan (Source control used: DCVS),StackOverflowJobListing,rep_range,TechOppose_15_TEXT,ToolCountWork,PlatformWantEntry,StackOverflowSatisfaction,age_midpoint,StackOverflowFoundAnswer,HopeFiveYears,AssessBenefits5,AINextLess integrated,AssessBenefits10,nan (Most important aspect of new job opportunity: Building something that matters),unit_testing,RaceEthnicity,What Country or Region do you live in?,nan (Employment Status),NEWCollabToolsDesireNextYear,Dependents,AIOpen,AdsPriorities7,nan (Current Lang & Tech: SQL),DevEnviron,JobContactPriorities1,SOFindAnswer,CareerSat,CommunicationTools,nan (Technical support),nan (Future Lang & Tech: Swift),nan (Training & Education: PhD in CS),nan (Most important aspect of new job opportunity: Industry),nan (Future Lang & Tech: Visual Basic),agree_diversity,DatabaseHaveEntry,WebframeChoice,WebframeWantEntry,nan (Android),nan (Android phone),nan (Most urgent info about job opportunity: Colleagues),Which US State or Territory do you live in?,nan (Software),HaveWorkedFramework,nan (Why try Stack Overflow Careers: Other),NEWCollabToolsWorkedWith,nan (Future Lang & Tech: MongoDB),NEWPurpleLink,AIModelsHaveEntry,DatabaseWorkedWith,Select all that apply (Source control used: Git),Which of our sites do you frequent most?,nan (Windows Phone),nan (Current Lang & Tech: iOS),AIToolCurrently partially AI,WebDeveloperType,nan (Dart),education,OfficeStackSyncWantToWorkWith,JobSatPoints_1,ShipIt,AdBlockerDisable,AIAgentOrchestration,nan (Current Lang & Tech: R),SO_Dev_Content,nan (Twitter),nan (Most important aspect of new job opportunity: Company size),TechEndorse_4,SOJobs,What type of project are you developing?,nan (Looking for a new job),AISent,SexualOrientation,nan (How can companies improve interview process: Flexible interview schedule),interview_likelihood,EthicsChoice,Were you aware of the Apptivate contest?,AnnoyingUI,nan (I'm a Seller),DatabaseAdmired,nan (Future Lang & Tech: LAMP),AIFrustration,ResponseId,nan (Redis),AssessBenefits2,nan (Why use Stack Overflow: Love to learn),nan (Other),visit_frequency,BuildingThings,ICorPM,InfluenceConsultants,BetterLife,EducationImportant,AgreeDisagree3,PlatformChoice,TechOppose_11,nan (Office in a Desirable City/Area),nan (Current Lang & Tech: Node.js),nan (Most important aspect of new job opportunity: Company reputation),nan (Room for Growth of Skills/Knowledge),"In the last 12 months, how much money have you spent on personal technology-related purchases?",nan (Answer questions I know the answer to),AILearnHow,CommPlatformWantEntr,MainBranch,nan (Training & Education: BS in CS),AssessJobDept,OfficeStackAsyncHaveWorkedWith,WantWorkDatabase,nan (Human Resources),CompTotal,nan (Future Lang & Tech: Ruby),nan (Job Satisfaction),important_ownoffice,How many years of IT/Programming experience do you have?,CareerSatisfaction,SO_Actions_15_TEXT,AIToolDon't plan to use AI for this task,CommPlatformHaveWorkedWith,nan (Future Lang & Tech: C++11),nan (Current Lang & Tech: CoffeeScript),commit_frequency,AssessJobLeaders,MajorUndergrad,nan (Describes benefits / perks of the work environment),nan (Most important aspect of new job opportunity: Company stage),JobSatPoints_13,nan (Years IT / Programming Experience),AIAgentImpactSomewhat disagree,YearsCode,SurveyEase,Select all that apply (Current Lang & Tech: Android),nan (Desktop Operating System: write-in),StackOverflowModeration,AssessJob4,StackOverflowJobsRecommend,nan (Stock Options/Profit Sharing Program),Which technology products do you own? (You can choose more than one) (iPhone),nan (How can companies improve interview process: Introduce me to boss),employment_status,Knowledge_8,nan (Describes company culture),nan (Current Lang & Tech: Matlab),NEWSOSites,nan (Most important aspect of new job opportunity: Company culture),AssessJobFinances,ChangeWorld,Age,LearnCodeOnline,nan (Current Lang & Tech: Redis),ProfessionalQuestion,AIAgentKnowWrite,nan (Current Lang & Tech: AngularJS),LanguagesWantEntry,nan (Servers),What is your involvement in purchasing? You can choose more than 1. (Influencer),Select all that apply (Most urgent info about job opportunity: Salary),"In an average week, how do you spend your time at work? (Developing new features)",nan (Future Lang & Tech: Rust),gender,AgreeDisagree1,Do you have a Stack Overflow Careers Profile?,nan (Perception of contact form: Xing),nan (Why answer: Demonstrate expertise),nan (Surfing the Internet),"If you make a software product, how does your company make money? (You can choose more than one) (Advertising)",nan (How can companies improve interview process: Fewer brainteasers),Professional,Where do you work remotely most of the time?,nan (Future Lang & Tech: PHP),AIAgentChallengesStrongly disagree,SOTagsHaveEntry,StackOverflowMakeMoney,WorkChallenge,SONewContent,AISearchWantToWorkWith,nan (I can buy anything I want without asking anyone),SO_Actions_7,nan (Python),tech_want,How many people work for your company?,WebFrameWorkedWith,StackOverflowJobSearch,nan (Current Lang & Tech: Hadoop),nan (How many hours programming as hobby per week?),ToolsTechAdmired,JobEmailPriorities5,NEWDevOpsImpt,PlatformAdmired,JobSatPoints_15_TEXT,nan (Future Lang & Tech: Java),SO_Actions_9,nan (Source control used: TFS),MentalHealth,StackOverflowBetter,nan (LinkedIn Inmail),Hobbyist,Have you changed jobs in the last 12 months?,nan (Future Lang & Tech: Arduino),WebframeDesireNextYear,AssessJobCommute,EntTeams,agree_problemsolving,nan (Fixing bugs),AINextMuch less integrated,How do you prefer to be contacted about job opportunities? (Email),TrueFalse_1,nan (Java),nan (Current Lang & Tech: F#),CommPlatformHaveEntr,NEWEdImpt,nan (Xbox 360),InTheZone,Exercise,NEWPurchaseResearch,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",nan (AppleTV),PlatformHaveEntry,TrueFalse_2,How would you best describe the industry you work in?,JobFactors,AIAgentObserveSecure,AIThreat,UK_Country,nan (Source control used: Perforce),nan (Arduino / Raspberry Pi),ImportantHiringAlgorithms,SOTagsWantToWorkWith,SOPartFreq,AIAgentImpactNeutral,LanguagesHaveEntry,How often do you visit job boards?,WelcomeChange,LanguageWantToWorkWith,ProfessionalTech,Select all that apply (Training & Education: No formal training),nan (Customers),hobby,Select all that apply (Why try Stack Overflow Careers: No spam),Knowledge_6,AdBlocker,nan (Hadoop),nan (Most annoying about job search: The Interview),desktop_os,InfluenceCloud,UpdateCV,nan (Xbox One),ImportantBenefits,nan (Excitement About the Company's Products),nan (Recommender),TechOppose_2,AdBlockerReasons,nan (Prefered Source Control),EquipmentSatisfiedRAM,Ethnicity,nan (Remote Status),LearnCodeChoose,nan (Tabs or Spaces),EmploymentAddl,Please rate your job/career satisfaction,nan (HDTV),YearsCodedJobPast,DevEnvsAdmired,"nan (User Equipment: Monitors, PCs, Laptops)",TechEndorse_9,nan (Future Lang & Tech: iOS),MetricAssess,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",OpSysProfessional use,nan (I click on ads that interest me),nan (Windows Tablet),Select all that apply (Why answer: Help a programmer in need),Currency,SocialMedia,TechOppose_7,TechEndorse_5,nan (Most annoying about job search: Taking time off work to interview),nan (Technical Support),nan (Perception of contact form: LinkedIn),WorkExp,CommPlatformWantToWorkWith,YearsCoding,"Including bonus, what is your annual compensation in USD?",AssessJobDiversity,InfluenceServers,AssessJob7,SO_Actions_1,DevEnvWantEntry,nan (Android Tablet),How do you use Stack Overflow? (Read other people's questions to solve my problems),nan (Preferred text editor: write-in),nan (C#),nan (Desktop Operating System),salary_midpoint,Knowledge_9,DatabaseWantEntry,AIAgentObsWrite,Select up to 3 (How can companies improve interview process: More live code),nan (Source control used: write-in),Have you visited / Are you aware of Stack Overflow Careers?,AISearchDevAdmired,WebframeHaveWorkedWith,Select up to 3 (Most annoying about job search: Finding time),WorkPayCare,nan (Why use Stack Overflow: Demonstrate expertise),nan (Who do you want to communicate with about a new job opportunity: Headhunter),What is your gender?,How did you find out about your current job?,nan (Current Lang & Tech: Ruby),AdsPriorities2,SO_Actions_5,nan (Wii U),University,how_to_improve_interview_process,EthicsReport,nan (I like that I can indicate ads I want to see less of),why_stack_overflow,StackOverflowCompanyPage,Extraversion,AIComplex,nan (Compensation: midpoint),InvestTimeTools,nan (Learning new skills),Check,AIAgentChallengesStrongly agree,ExCoderReturn,ITperson,"Including yourself, how many developers are employed at your company?",nan (Current Lang & Tech: PHP),TechEndorse_6,ImportantHiringCompanies,SOAI,CodeRev,JobContactPriorities4,developer_challenges,NEWJobHunt,EquipmentSatisfiedCPU,nan (Most urgent info about job opportunity: Company name),SurveyYear,JobSatPoints_14,UnderstandComputers,WorkPlan,AIBen,nan (Current Lang & Tech: Swift),nan (Why use Stack Overflow: Maintain online presence),How often are you contacted by recruiters?,InfluenceRecruitment,InfluenceInternet,TechEndorse_2,AIToolInterested in Using,Knowledge_2,JobSatPoints_11,SO_Actions_15,nan (Most important aspect of new job opportunity: Office location),ExCoderActive,MgrIdiot,nan (Future Lang & Tech: Python),AssessBenefits7,WebframeAdmired,YearsProgram,AssessJob5,nan (Why answer: I don't answer and I don't want to),nan (Most important aspect of new job opportunity: Tech stack),JobSearchStatus,"nan (Link to a Stack Overflow Careers Company Page or other source of more information about the company (videos, articles, etc))",SurveyLong,AgreeDisagree2,nan (CSS),"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",PurchaseInfluence,nan (Future Lang & Tech: Sharepoint),nan (TypeScript),TechOppose_15,ProfessionalCloud,ToolsTechWantToWorkWith,nan (I have a discretionary budget at my disposal),nan (Perception of contact form: Phone),nan (Future Lang & Tech: Redis),StackOverflowCopiedCode,Knowledge_7,How likely is it that a recommendation you make will be acted upon?,nan (Nook),AINextNeither different nor similar,SOFriction,Please rate the advertising you've seen on Stack Overflow (The ads are relevant),SOVisitFreq,LearnedHiring,InterestedAnswers,Salary,SurveyLength,CompetePeers,nan (Source control used: SVN),EdLevel,nan (Current Lang & Tech: C++11),AINextSomewhat similar,nan (Direct sales to consumers),nan (Future Lang & Tech: Dart),ExCoderBalance,OrgSize,US_State,nan (Current Lang & Tech: SQL Server),occupation,SOTimeSaved,nan (Training & Education: Industry certification),AIAgentOrchWrite,AdsAgreeDisagree2,"In an average week, how do you spend your time? (Developing new features)",nan (Source control used: Mercurial),EmbeddedHaveWorkedWith,nan (How can companies improve interview process: Introduce me to team),ChallengeMyself,StackOverflowCommunity,nan (Sales / Marketing),ProblemSolving,PlatformWantToWorkWith,JobContactPriorities3,nan (Preferred text editor),AINextVery different,TechOppose_5,nan (Check Writer),nan (Current Lang & Tech: Windows Phone),Age1stCode,nan (Why answer: Self promotion),women_on_team,nan (Future Lang & Tech: Write-In),nan (Training & Education: Online Class),nan (Refactoring / code quality),nan (Current Lang & Tech: Wordpress),dev_environment,nan (Future Lang & Tech: Salesforce),DiversityImportant,AINextSomewhat different,AssessJob10,"nan (Positive Organization Structure (not much bureaucracy, helpful management))",MgrMoney,TechOppose_13,nan (Future Lang & Tech: Cloud),TimeAnswering,TechEndorse_13,"nan ($10,001 - $25,000)",nan (F#),nan (Most annoying about job search: Finding job I'm qualified for),AIDevWantToWorkWith,nan (Age),nan (PS4),nan (How often contacted by recruiters),InfluenceDatabase,AssessJob8,nan (Commuting),ExCoderWillNotCode,Frustration,KinshipDevelopers,StackOverflowVisit,SO_Actions_4,NEWCollabToolsWantToWorkWith,AIToolPlan to mostly use AI,JobEmailPriorities7,EnjoyDebugging,nan (HTML5),HackathonReasons,nan (Other tablet),JobSeekingStatus,AuditoryEnvironment,agree_legacy,Knowledge_3,AIDevHaveWorkedWith,experience_midpoint,new_job_value,TabsSpaces,TechEndorse,nan (Autonomy Over Budget/Expenditures),"Please rate how important each of the following characteristics of a company/job offer are to you. Please select a MAXIMUM of 3 items as ""Non-Negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (High Base Compensation)",ScreenName,SOHow,nan (Consultants),programming_ability,CompanyType,AdsPriorities5,JobSatPoints_15,star_wars_vs_star_trek,nan (Haskell),WantWorkFramework,nan (Most annoying about job search: Finding interesting job),nan,nan (How often are Stack Overflow's answers helpful),MiscTechDesireNextYear,nan (Most important aspect of new job opportunity: Equity),nan (Who do you want to communicate with about a new job opportunity: In-house tech recruiter),nan (Why try Stack Overflow Careers: Selection of revelant jobs),PurchaseWhat,InfluenceCommunication,RemoteWork,InfluenceHardware,VCHostingPersonal use,nan (Current Lang & Tech: Sharepoint),DevEnvsWantToWorkWith,AIToolPlan to partially use AI,AIAgentChallengesSomewhat agree,"What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",FormalEducation,self_identification,StackOverflowAdsDistracting,MiscTechWorkedWith,SOVisit1st,nan (AngularJS),nan (Current Lang & Tech: Arduino),nan (Why use Stack Overflow: Communicate with others),nan (Consulting),nan (How can companies improve interview process: Show me workplace),ExCoderBelonged,nan (Kindle Fire),How many developers are employed at your company?,MiscTechHaveWorkedWith,SOTagsAdmired,AssessJob3,BuyNewTool,nan (Most important aspect of new job opportunity: Job title),nan (Android tablet),nan (Why use Stack Overflow: I don't use Stack Overflow),What is your current Stack Overflow reputation?,nan (Most urgent info about job opportunity: Tech stack),nan (Regular Mobile Phone),TimeFullyProductive,WantWorkLanguage,StackOverflowHasAccount,StackOverflowNewQuestion,"nan (Quality of Workstation (dream machine, 30inch monitors, etc))",IDE,nan (Why use Stack Overflow: Receive help on personal projects),remote,LanguageAdmired,JobSeek,AssessJobRemote,nan (Limited night / weekend work),nan (Training & Education: Boot camp or night school),StackOverflowDevStory,nan (Future Lang & Tech: Cordova),StackOverflowMetaChat,important_sameend,JobSatPoints_9,JobEmailPriorities6,InfluenceDeptTech,Do you enjoy working remotely?,nan (Future Lang & Tech: C),Race,FizzBuzz,DevEnvsHaveWorkedWith,SOTagsHaveWorkedWith,ProgramHobby,nan (Lots of Control Over Your Own Work),DatabaseHaveWorkedWith,VCInteraction,TechOppose_3,NewRole,Q120,AIFuture,nan (Node.js),nan (Current Lang & Tech: Spark),nan (How can companies improve interview process: Better preparation),AdsAgreeDisagree1,ImpSyn,JobSatPoints_7,AIHuman,OfficeStackAsyncWantToWorkWith,nan (Current Lang & Tech: Go),How often do you find solutions to your programming problems on Stack Overflow without asking a new question?,AssessJob1,AIToolCurrently mostly AI,AIAgentImpactStrongly agree,nan (Other (please specify)),EthicsResponsible,nan (Future Lang & Tech: Objective-C),WantWorkPlatform,HaveWorkedPlatform,FriendsDevelopers,nan (Future Lang & Tech: Haskell),NEWJobHuntResearch,industry,LanguageHaveWorkedWith,HoursPerWeek,ExCoderNotForMe,nan (Open to new job opportunities),JobSatPoints_16,What operating system do you use the most?,nan (No mobile app),nan (Convenient Commute or Telecommute Options),nan (How important is remote when evaluating new job opportunity?),WebframeWantToWorkWith,SOComm,JobSatPoints_4,nan (Current Lang & Tech: Objective-C),Which best describes the size of your company?,nan (Most urgent info about job opportunity: Product details),nan (Future Lang & Tech: Windows Phone),"nan (>$150,000)",nan (Future Lang & Tech: F#),nan (Using Stack Exchange),nan (How many caffeinated beverages per day?),nan (Future Lang & Tech: Perl),job_discovery,nan (Prefered Source Control: write-in),JobProfile,nan (other (please specify)),nan (PHP),UndergradMajor,nan (Future Lang & Tech: Hadoop),AISearchDevHaveWorkedWith,TechEndorse_3,YearsCodePro,Which technologies are you excited about? (Node.js),OfficeStackSyncHaveWorkedWith,SalaryType,nan (Future Lang & Tech: Node.js),DeveloperType,AssessJobRole,AISearchHaveWorkedWith,EthicalImplications,why_learn_new_tech,AssessJobIndustry,nan (Perl),nan (Changed Jobs in last 12 Months),nan (Country),SelfTaughtTypes,InfluenceTechStack,MiscTechWantToWorkWith,nan (Approver),How would you best describe the industry you currently work in?,AIAcc,What types of purchases are you involved in? (Hardware),OpSys,nan (Why use Stack Overflow: Can't do job without it),experience_range,nan (Most important aspect of new job opportunity: Remote working),AssessBenefits3,nan (What ads? I use an ad blocker),nan (Blu-Ray),CheckInCode,nan (Netbook),agree_loveboss,CodingActivities,PronounceGIF,nan (Why answer: Sense of responsibility to developers),Frequency_1,nan (jQuery),nan (Current Lang & Tech: Python),nan (High Caliber Team (is everyone else smart/hardworking)),nan (The ads are Informative),UnitTests,OpSysPersonal use,TechEndorse_1,nan (C),EquipmentSatisfiedMonitors,nan (Objective-C),AssessJob9,MobileDeveloperType,nan (MongoDB),Frequency_2,NEWOvertime,nan (PS3),nan (Finance),WorkWeekHrs,nan (The ads are entertaining),nan (Includes salary information),CurrencySymbol,open_to_new_job,AIChallenges,nan (Future Lang & Tech: C++),important_variety,JobEmailPriorities1,nan (Current Lang & Tech: C++),nan (Why try Stack Overflow Careers: Jobs site for programmers),StackOverflowAdsRelevant,Are you currently looking for a job or open to new opportunities?,JobContactPriorities5,SkipMeals,big_mac_index,AssessBenefits11,nan (Current Lang & Tech: Haskell),nan (Why answer: I don't answer but I want to),SO_Actions_16,BoringDetails,TechEndorseIntro,JobEmailPriorities4,Do you work remotely?,nan (Appealing message traits: Code or projects mentioned),EmploymentStatus,nan (Future Lang & Tech: Spark),nan (Current Lang & Tech: C#),nan (Industry),SOVisitTo,nan (Future Lang & Tech: Clojure),YearsCodingProf,Trans,Industry,nan (Perception of contact form: Twitter),AISearchDevWantToWorkWith,nan (CoffeeScript),ResumeUpdate,"You answered you don't have a Careers profile, can you elaborate why?",nan (JavaScript),AssessJob6,nan (Product Managers),nan (SQL),nan (Meetings),nan (Identification With the Company/Goals),nan (Appealing message traits: Company culture described),WebframeWorkedWith,ToolsTechHaveWorkedWith,BuildvsBuy,AssessJobOffice,"If your company has a native mobile app, what platforms do you support? (iPhone)",HypotheticalTools3,nan (Most annoying about job search: Interesting companies rarely respond),nan (Why try Stack Overflow Careers: Showcase Stack Overflow activity),important_newtech,ExCoderSkills,StackOverflowWhatDo,StackOverflowDescribes,AssessJobCompensation,nan (Why try Stack Overflow Careers: Jobs are on Stack Overflow),ImportantHiringPMExp,ImportantHiringEducation,JobSatPoints_8,Which languages are you proficient in? (Java),nan (Android Phone),AINextMuch more integrated,nan (Other gaming system),nan (Mobile app sales),team_size_range,nan (Wii),Overpaid,YearsCodedJob,StackOverflowHelpful,CompFreq,nan (Training & Education: On the job),HoursOutside,nan (Other netbook),Onboarding,AIAgentChallengesSomewhat disagree,AssessJobProduct,LastInt,JobSat,AssessJobTech,ImportantHiringRep,"nan (High Quality Office Space (amenities, lounge space, free food, etc))",PurchaseHow,agree_nightcode,DatabaseDesireNextYear,Do you have a Stack Overflow Careers 2.0 Profile?,NEWStuck,SO_Actions_10,EmbeddedAdmired,AIExplain,TBranch,job_satisfaction,important_buildexisting,StackOverflowParticipate,agree_tech,What best describes your career / job satisfaction?,HypotheticalTools4,nan (Future Lang & Tech: Go),AIInteresting,nan (Stack Overflow Careers Message),nan (Current Lang & Tech: Rust),nan (Why use Stack Overflow: To give help),HoursComputer,CompanySize,CommPlatformAdmired,nan (Source control used: I don't use source control),nan (Most urgent info about job opportunity: Office location),ConvertedCompYearly,LearnCodeAI,StackOverflowDevices,MilitaryUS,AIModelsWantToWorkWith,nan (Other Stack Exchange (please specify)),VersionControlSystem,nan (Appealing message traits: Benefits & Perks),AIEthics,Knowledge_1,HypotheticalTools1,nan (Most important aspect of new job opportunity: Advancement),OpenSource,nan (Purchaser),HaveWorkedLanguage,FrameworkWorkedWith,WebframeHaveEntry,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one) (Influencer),ImportantHiringTechExp,Which of the following languages or technologies have you used significantly in the past year? (C),nan (Most important aspect of new job opportunity: Important decisions),nan (C++11),AIModelsHaveWorkedWith,ConvertedComp,nan (Future Lang & Tech: SQL Server),EduOther,AIAgentImpactStrongly disagree,Which desktop operating system do you use the most?,nan (Future Lang & Tech: SQL),nan (Current Lang & Tech: Java),HaveWorkedDatabase,Hobby,JobEmailPriorities2,WorkLoc,nan (Compensation),company_size_range,TechEndorse_8,nan (How frequently land on or read Stack Overflow),nan (Source control used: CVS),nan (Source control used: Legacy / Custom),country,NEWLearn,nan (Perception of contact form: Email),nan (JQuery),AIModelsWantEntry,OfficeStackSyncAdmired,nan (Describes the team I will work on),Containers,nan (Perception of contact form: Facebook),TechList,nan (Current Lang & Tech: Dart),agree_mars,AISelect,AssessBenefits8,How old are you?,nan (Source control used: Bitkeeper),nan (Cordova),salary_range,nan (Phone),JobSatPoints_10,AdsActions,AIAgentChange,agree_adblocker,ImportantHiringTitles,important_buildnew,StackOverflowAnswer,important_control,VersionControl,nan (Recruitment Tools & Services),Gender,AdsPriorities6,DatabaseChoice,LearningNewTech,"nan ($41,000 - $75,000)",nan (Appealing message traits: Stack Overflow activity mentioned),NEWDevOps,nan (Prefered IDE theme),CollaborateRemote,dogs_vs_cats,EducationTypes,BlockchainIs,nan (Grants / outside fund-raising),ExpectedSalary,OffOn,nan (Most annoying about job search: Writing and updating CV),EmbeddedWantToWorkWith,nan (C++),PlatformDesireNextYear,SO_Actions_6,DevEnvHaveEntry,nan (I've taken a trial/purchased a product from ads),nan (Perception of recruiter contact),NonDeveloperType,Did you participate in the Apptivate contest?,"nan ($100,001 - $150,000)",StackOverflowJobs,NEWOffTopic,nan (Blackberry),OperatingSystem,nan (Training & Education: Mentorship),EducationParents,nan (Most important aspect of new job opportunity: Health insurance),nan (Xbox),ErgonomicDevices,nan (Future Lang & Tech: R),Knowledge_5,HighestEducationParents,important_promotion,nan (Purchasing Power),LanguageWorkedWith,nan (Current Lang & Tech: Cassandra),"nan (I influence purchasing decisions, but don't have final approval)",nan (Appealing message traits: Stack Overflow Company Page),job_search_annoyance,WorkStart,DevType,NEWOtherComms,AIAgent_Uses,DifficultCommunication,nan (Future Lang & Tech: C#),nan (No Involvement),nan (Future Lang & Tech: Matlab),nan (Training & Education: Other),ExCoder10Years,AssessJobProjects,ImportantHiringCommunication,Select all that apply (Why use Stack Overflow: Help for job),nan (Current Lang & Tech: Perl),WebFrameDesireNextYear,ProjectManagement,TimeAfterBootcamp,AIAgentChallengesNeutral,LearnCodeCoursesCert,nan (Current Lang & Tech: C),nan (Future Lang & Tech: JavaScript),nan (Who do you want to communicate with about a new job opportunity: Manager),nan (Future Lang & Tech: Wordpress),nan (Most important aspect of new job opportunity: Flexible work options),Employment,Knowledge_4,SOTagsWant Entry,HypotheticalTools5,nan (Looking for a job),nan (Current Lang & Tech: Visual Basic),LastHireDate,JobSecurity,WorkRemote,TechOppose_1
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$25,001 - $40,000",,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Africa,,,,,,,,,,,,,,,,,,,,,,,,,Software,,,,,,,,,,,,,,,,,,,,,,,,,,,Mobile,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,<$100,,,,,,,,,,,,,,<2,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Consulting,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FML,,,,,,,,,,,,,,,,,,,,,,,Student / Unemployed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,,,,Start Up (1-25),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Approver,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,< 20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows 7,,,,,,,,Mature Small Business (25-100),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SQL,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,Other netbook,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,India,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SaaS,,,,,,,,,I'm a Seller,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,,,,Mid Sized (100-999),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,SQL,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Kindle,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Germany,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Foundation / Non-Profit,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,,,,,,,,,,Student / Unemployed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Mobile Phone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,,,,Student,,,,,,,,,,,Haskell,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,Other gaming system,,,Wii,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,< 20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No Involvement,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Kindle,,,,,,,,,,,,,,,,,,,,,,,,,"$75,001 - $100,000",,,,,,,,,,,,,,,,,,"$25,001 - $40,000",,,,,,,,,,,,,,,,,,,,,,"Executive (VP of Eng, CTO, CIO, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,Don't know,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Asia,,,,,,,,,,,,,,,,,,,,,Android,,,,Software,,,,,,,,,Stack Overflow,,,,,,,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,,,,,,11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Servers,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It pays the bills,,,,"User Equipment: Monitors, PCs, Laptops",,,,,,,,,,,,,,,,,,,"$80,000 - $100,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CSS,"<$10,000",,,,,,,,,,,,I run this place,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"$10,001 - $25,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Regular Mobile Phone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,,,,,Start Up (1-25),,,">$150,000",,,,,,,,,PHP,,,,,,,,,,,,,,,,Perl,,,,,,Approver,,,Hardware,,,,,,,,,,,,,,,,,,,,,,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,JavaScript,,,SQL,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Purchaser,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,35-39,,,,,,,,,,,,,,,,,,,"$41,000 - $75,000",,,,,,,,,,,,,C++,,,,,,,,"$100,001 - $150,000",,,,,,,,Xbox,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Create a stratified sample of 20% of the data
if combined_df is not None:
    # Calculate 20% sample size for each year
    sample_size = 0.2
    
    # Perform stratified sampling
    stratified_sample = combined_df.groupby('SurveyYear', group_keys=False).apply(
        lambda x: x.sample(frac=sample_size, random_state=42)
    ).reset_index(drop=True)
    
    # Export to CSV
    output_file = 'stackoverflow_survey_stratified_sample.csv'
    stratified_sample.to_csv(output_file, index=False)
    
    print(f"Original dataset size: {len(combined_df):,} rows")
    print(f"Sampled dataset size: {len(stratified_sample):,} rows")
    print(f"\nSample size by year:")
    print(stratified_sample['SurveyYear'].value_counts().sort_index())
    print(f"\nData exported to: {output_file}")

Original dataset size: 772,599 rows
Sampled dataset size: 154,521 rows

Sample size by year:
SurveyYear
2011      563
2012     1249
2013     1948
2014     1529
2015     5217
2016    11206
2017    10278
2018    19771
2019    17777
2020    12892
2021    16688
2022    14654
2023    17837
2024    13087
2025     9825
Name: count, dtype: int64

Data exported to: stackoverflow_survey_stratified_sample.csv


In [14]:
stratified_sample.to_csv('stackoverflow_survey_stratified_sample.csv')

## Adjust df_use based on the dataframe you wwant to use (sample_df, stratified_sample, etc)

In [None]:
sample_df = pd.read_csv('stackoverflow_survey_stratified_sample.csv')

In [15]:
df_use = stratified_sample.copy()

In [16]:
# Remove any columns with 'Unnamed' in their name from the combined dataframe
if df_use is not None:
    df_use = df_use.loc[:, ~df_use.columns.str.contains('^Unnamed')]

In [None]:
with pd.option_context('display.max_rows', None):
    display(df_use.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
AIAgentChallengesStrongly agree,4637.0,103.0,I am concerned about the accuracy of the information provided by AI agents.;I have concerns about the security and privacy of data when using AI agents.,928.0,,,,,,,
nan (Future Lang & Tech: C),523.0,1.0,C,523.0,,,,,,,
nan (Who do you want to communicate with about a new job opportunity: In-house recruiter),3089.0,3.0,Tolerate,1999.0,,,,,,,
TimeSearching,21687.0,5.0,30-60 minutes a day,8166.0,,,,,,,
nan (How can companies improve interview process: Show me workplace),1165.0,1.0,Show me the space in which I will work,1165.0,,,,,,,
nan (Why answer: I don't answer but I want to),1190.0,1.0,"I don't answer much (or at all), but I want to answer more",1190.0,,,,,,,
AISent,28100.0,6.0,Favorable,12867.0,,,,,,,
CommunicationTools,11387.0,709.0,"Office / productivity suite (Microsoft Office, Google Suite, etc.)",718.0,,,,,,,
TechEndorseIntro,7551.0,3.0,Work,5293.0,,,,,,,
Age,109280.0,109.0,25-34 years old,26873.0,,,,,,,


In [17]:
# Remove columns with 'nan' in their names from combined_df and individual year dataframes
if df_use is not None:
    # For combined dataframe
    nan_columns = df_use.columns[df_use.columns.str.contains('nan', case=False, na=False)]
    if len(nan_columns) > 0:
        print("Removing columns containing 'nan' from combined dataframe:")
        print(list(nan_columns))
        df_use = df_use.drop(columns=nan_columns)

print("\nDone cleaning column names.")

Removing columns containing 'nan' from combined dataframe:
['nan (Who do you want to communicate with about a new job opportunity: Developer)', 'nan (Perception of contact form: Stack Overflow Careers)', 'nan (Current Lang & Tech: Cloud)', 'nan (PhoneGap)', 'nan (Occupation)', 'nan (Training & Education: Some college, but no CS degree)', 'nan (Appealing message traits: Salary information)', 'nan (Go)', 'nan (Current Lang & Tech: Write-In)', 'nan (Kindle)', 'nan (Software as a service / recurring billing)', 'nan (Appealing message traits: Team described)', 'nan (Who do you want to communicate with about a new job opportunity: In-house recruiter)', 'nan (Most important aspect of new job opportunity: Work - Life balance)', 'nan ($75,001 - $100,000)', 'nan (Current Lang & Tech: LAMP)', 'nan (iPad)', 'nan (Most important aspect of new job opportunity: Quality of colleagues)', 'nan (Boxee)', 'nan (Most urgent info about job opportunity: Benefits)', 'nan (Build my online reputation)', 'nan (C

In [18]:
# Update all column names in combined_df and each dataframe in dataframes to be lower case
df_use.columns = [col.lower() for col in df_use.columns]

In [19]:
df_use.head()

Unnamed: 0,toolcountpersonal,newcollabtoolshaveworkedwith,techendorse_13_text,numbermonitors,vchostingprofessional use,select up to 3 (appealing message traits: message is personalized),jobemailpriorities3,what advertisers do you remember seeing on stack overflow? (open-ended response),select all that apply (future lang & tech: android),country,influenceviztools,truefalse_3,aidangerous,techoppose_16,lastnewjob,ainextvery similar,officestackasyncadmired,jobcontactpriorities2,blockchain,aimodelschoice,importanthiringgettingthingsdone,student,assessjobexp,agree_notice,methodology,newonboardgood,equipmentsatisfiedrw,opensourcer,assessbenefits6,un_subregion,otherpeoplescode,stackoverflowconsidermember,accessibility,age_range,adsagreedisagree3,jobsatisfaction,so_actions_3,what other departments / roles do you interact with regularly? (system administrators),aiagentextwrite,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",sohowmuchtime,languagedesirenextyear,jobsatpoints_6,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),currencydesc,select up to 3 (most important aspect of new job opportunity: salary),frameworkdesirenextyear,occupation_group,airesponsible,sexuality,aitoolcurrently using,adspriorities1,aiagents,aiagentexternal,surveytoolong,agentusesgeneral,blockchainorg,ainextmore integrated,homeremote,coderevhrs,timesearching,resumeprompted,databasewanttoworkwith,convertedsalary,assessjob2,equipmentsatisfiedstorage,questionsinteresting,what country do you live in?,waketime,soaccount,questionsconfusing,assessbenefits1,techdoc,which of the following best describes your occupation?,aiagentimpactsomewhat agree,platformhaveworkedwith,frequency_3,cousineducation,misctechadmired,clickykeys,importanthiringopensource,adspriorities4,soduration,officestackwantentry,so_region,languagechoice,aimodelsadmired,influenceworkstation,surveyeasy,mgrwant,important_companymission,respondent,agree_alcohol,assessjobprofdevel,how large is the team that you work on?,techoppose_9,officestackhaveentry,hypotheticaltools2,seriouswork,assessbenefits4,tech_do,have you visited / are you aware of stack overflow careers 2.0?,platformworkedwith,adspriorities3,learncode,jobsatpoints_5,techendorse_7,aitoolnot interested in using,stackoverflowrecommend,important_wfh,aiagentknowledge,aliens,collector,rightwrongway,assessbenefits9,ainextno change,newcollabtoolsadmired,devenvschoice,stackoverflowjoblisting,rep_range,techoppose_15_text,toolcountwork,platformwantentry,stackoverflowsatisfaction,age_midpoint,stackoverflowfoundanswer,hopefiveyears,assessbenefits5,ainextless integrated,assessbenefits10,unit_testing,raceethnicity,what country or region do you live in?,newcollabtoolsdesirenextyear,dependents,aiopen,adspriorities7,devenviron,jobcontactpriorities1,sofindanswer,careersat,communicationtools,agree_diversity,databasehaveentry,webframechoice,webframewantentry,which us state or territory do you live in?,haveworkedframework,newcollabtoolsworkedwith,newpurplelink,aimodelshaveentry,databaseworkedwith,select all that apply (source control used: git),which of our sites do you frequent most?,aitoolcurrently partially ai,webdevelopertype,education,officestacksyncwanttoworkwith,jobsatpoints_1,shipit,adblockerdisable,aiagentorchestration,so_dev_content,techendorse_4,sojobs,what type of project are you developing?,aisent,sexualorientation,interview_likelihood,ethicschoice,were you aware of the apptivate contest?,annoyingui,databaseadmired,aifrustration,responseid,assessbenefits2,visit_frequency,buildingthings,icorpm,influenceconsultants,betterlife,educationimportant,agreedisagree3,platformchoice,techoppose_11,"in the last 12 months, how much money have you spent on personal technology-related purchases?",ailearnhow,commplatformwantentr,mainbranch,assessjobdept,officestackasynchaveworkedwith,wantworkdatabase,comptotal,important_ownoffice,how many years of it/programming experience do you have?,careersatisfaction,so_actions_15_text,aitooldon't plan to use ai for this task,commplatformhaveworkedwith,commit_frequency,assessjobleaders,majorundergrad,jobsatpoints_13,aiagentimpactsomewhat disagree,yearscode,surveyease,select all that apply (current lang & tech: android),stackoverflowmoderation,assessjob4,stackoverflowjobsrecommend,which technology products do you own? (you can choose more than one) (iphone),employment_status,knowledge_8,newsosites,changeworld,age,learncodeonline,professionalquestion,aiagentknowwrite,languageswantentry,what is your involvement in purchasing? you can choose more than 1. (influencer),select all that apply (most urgent info about job opportunity: salary),"in an average week, how do you spend your time at work? (developing new features)",gender,agreedisagree1,do you have a stack overflow careers profile?,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",professional,where do you work remotely most of the time?,aiagentchallengesstrongly disagree,sotagshaveentry,stackoverflowmakemoney,workchallenge,sonewcontent,aisearchwanttoworkwith,so_actions_7,tech_want,how many people work for your company?,webframeworkedwith,stackoverflowjobsearch,toolstechadmired,jobemailpriorities5,newdevopsimpt,platformadmired,jobsatpoints_15_text,so_actions_9,mentalhealth,stackoverflowbetter,hobbyist,have you changed jobs in the last 12 months?,webframedesirenextyear,assessjobcommute,entteams,agree_problemsolving,ainextmuch less integrated,how do you prefer to be contacted about job opportunities? (email),truefalse_1,commplatformhaveentr,newedimpt,inthezone,exercise,newpurchaseresearch,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",platformhaveentry,truefalse_2,how would you best describe the industry you work in?,jobfactors,aiagentobservesecure,aithreat,uk_country,importanthiringalgorithms,sotagswanttoworkwith,sopartfreq,aiagentimpactneutral,languageshaveentry,how often do you visit job boards?,welcomechange,languagewanttoworkwith,professionaltech,select all that apply (training & education: no formal training),hobby,select all that apply (why try stack overflow careers: no spam),knowledge_6,adblocker,desktop_os,influencecloud,updatecv,importantbenefits,techoppose_2,adblockerreasons,equipmentsatisfiedram,ethnicity,learncodechoose,employmentaddl,please rate your job/career satisfaction,yearscodedjobpast,devenvsadmired,techendorse_9,metricassess,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",opsysprofessional use,select all that apply (why answer: help a programmer in need),currency,socialmedia,techoppose_7,techendorse_5,workexp,commplatformwanttoworkwith,yearscoding,"including bonus, what is your annual compensation in usd?",assessjobdiversity,influenceservers,assessjob7,so_actions_1,devenvwantentry,how do you use stack overflow? (read other people's questions to solve my problems),salary_midpoint,knowledge_9,databasewantentry,aiagentobswrite,select up to 3 (how can companies improve interview process: more live code),have you visited / are you aware of stack overflow careers?,aisearchdevadmired,webframehaveworkedwith,select up to 3 (most annoying about job search: finding time),workpaycare,what is your gender?,how did you find out about your current job?,adspriorities2,so_actions_5,university,how_to_improve_interview_process,ethicsreport,why_stack_overflow,stackoverflowcompanypage,extraversion,aicomplex,investtimetools,check,aiagentchallengesstrongly agree,excoderreturn,itperson,"including yourself, how many developers are employed at your company?",techendorse_6,importanthiringcompanies,soai,coderev,jobcontactpriorities4,developer_challenges,newjobhunt,equipmentsatisfiedcpu,surveyyear,jobsatpoints_14,understandcomputers,workplan,aiben,how often are you contacted by recruiters?,influencerecruitment,influenceinternet,techendorse_2,aitoolinterested in using,knowledge_2,jobsatpoints_11,so_actions_15,excoderactive,mgridiot,assessbenefits7,webframeadmired,yearsprogram,assessjob5,jobsearchstatus,surveylong,agreedisagree2,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",purchaseinfluence,techoppose_15,professionalcloud,toolstechwanttoworkwith,stackoverflowcopiedcode,knowledge_7,how likely is it that a recommendation you make will be acted upon?,ainextneither different nor similar,sofriction,please rate the advertising you've seen on stack overflow (the ads are relevant),sovisitfreq,learnedhiring,interestedanswers,salary,surveylength,competepeers,edlevel,ainextsomewhat similar,excoderbalance,orgsize,us_state,occupation,sotimesaved,aiagentorchwrite,adsagreedisagree2,"in an average week, how do you spend your time? (developing new features)",embeddedhaveworkedwith,challengemyself,stackoverflowcommunity,problemsolving,platformwanttoworkwith,jobcontactpriorities3,ainextvery different,techoppose_5,age1stcode,women_on_team,dev_environment,diversityimportant,ainextsomewhat different,assessjob10,mgrmoney,techoppose_13,timeanswering,techendorse_13,aidevwanttoworkwith,influencedatabase,assessjob8,excoderwillnotcode,frustration,kinshipdevelopers,stackoverflowvisit,so_actions_4,newcollabtoolswanttoworkwith,aitoolplan to mostly use ai,jobemailpriorities7,enjoydebugging,hackathonreasons,jobseekingstatus,auditoryenvironment,agree_legacy,knowledge_3,aidevhaveworkedwith,experience_midpoint,new_job_value,tabsspaces,techendorse,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",screenname,sohow,programming_ability,companytype,adspriorities5,jobsatpoints_15,star_wars_vs_star_trek,wantworkframework,misctechdesirenextyear,purchasewhat,influencecommunication,remotework,influencehardware,vchostingpersonal use,devenvswanttoworkwith,aitoolplan to partially use ai,aiagentchallengessomewhat agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",formaleducation,self_identification,stackoverflowadsdistracting,misctechworkedwith,sovisit1st,excoderbelonged,how many developers are employed at your company?,misctechhaveworkedwith,sotagsadmired,assessjob3,buynewtool,what is your current stack overflow reputation?,timefullyproductive,wantworklanguage,stackoverflowhasaccount,stackoverflownewquestion,ide,remote,languageadmired,jobseek,assessjobremote,stackoverflowdevstory,stackoverflowmetachat,important_sameend,jobsatpoints_9,jobemailpriorities6,influencedepttech,do you enjoy working remotely?,race,fizzbuzz,devenvshaveworkedwith,sotagshaveworkedwith,programhobby,databasehaveworkedwith,vcinteraction,techoppose_3,newrole,q120,aifuture,adsagreedisagree1,impsyn,jobsatpoints_7,aihuman,officestackasyncwanttoworkwith,how often do you find solutions to your programming problems on stack overflow without asking a new question?,assessjob1,aitoolcurrently mostly ai,aiagentimpactstrongly agree,ethicsresponsible,wantworkplatform,haveworkedplatform,friendsdevelopers,newjobhuntresearch,industry,languagehaveworkedwith,hoursperweek,excodernotforme,jobsatpoints_16,what operating system do you use the most?,webframewanttoworkwith,socomm,jobsatpoints_4,which best describes the size of your company?,job_discovery,jobprofile,undergradmajor,aisearchdevhaveworkedwith,techendorse_3,yearscodepro,which technologies are you excited about? (node.js),officestacksynchaveworkedwith,salarytype,developertype,assessjobrole,aisearchhaveworkedwith,ethicalimplications,why_learn_new_tech,assessjobindustry,selftaughttypes,influencetechstack,misctechwanttoworkwith,how would you best describe the industry you currently work in?,aiacc,what types of purchases are you involved in? (hardware),opsys,experience_range,assessbenefits3,checkincode,agree_loveboss,codingactivities,pronouncegif,frequency_1,unittests,opsyspersonal use,techendorse_1,equipmentsatisfiedmonitors,assessjob9,mobiledevelopertype,frequency_2,newovertime,workweekhrs,currencysymbol,open_to_new_job,aichallenges,important_variety,jobemailpriorities1,stackoverflowadsrelevant,are you currently looking for a job or open to new opportunities?,jobcontactpriorities5,skipmeals,big_mac_index,assessbenefits11,so_actions_16,boringdetails,techendorseintro,jobemailpriorities4,do you work remotely?,employmentstatus,sovisitto,yearscodingprof,trans,industry.1,aisearchdevwanttoworkwith,resumeupdate,"you answered you don't have a careers profile, can you elaborate why?",assessjob6,webframeworkedwith.1,toolstechhaveworkedwith,buildvsbuy,assessjoboffice,"if your company has a native mobile app, what platforms do you support? (iphone)",hypotheticaltools3,important_newtech,excoderskills,stackoverflowwhatdo,stackoverflowdescribes,assessjobcompensation,importanthiringpmexp,importanthiringeducation,jobsatpoints_8,which languages are you proficient in? (java),ainextmuch more integrated,team_size_range,overpaid,yearscodedjob,stackoverflowhelpful,compfreq,hoursoutside,onboarding,aiagentchallengessomewhat disagree,assessjobproduct,lastint,jobsat,assessjobtech,importanthiringrep,purchasehow,agree_nightcode,databasedesirenextyear,do you have a stack overflow careers 2.0 profile?,newstuck,so_actions_10,embeddedadmired,aiexplain,tbranch,job_satisfaction,important_buildexisting,stackoverflowparticipate,agree_tech,what best describes your career / job satisfaction?,hypotheticaltools4,aiinteresting,hourscomputer,companysize,commplatformadmired,convertedcompyearly,learncodeai,stackoverflowdevices,militaryus,aimodelswanttoworkwith,versioncontrolsystem,aiethics,knowledge_1,hypotheticaltools1,opensource,haveworkedlanguage,frameworkworkedwith,webframehaveentry,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),importanthiringtechexp,which of the following languages or technologies have you used significantly in the past year? (c),aimodelshaveworkedwith,convertedcomp,eduother,aiagentimpactstrongly disagree,which desktop operating system do you use the most?,haveworkeddatabase,hobby.1,jobemailpriorities2,workloc,company_size_range,techendorse_8,country.1,newlearn,aimodelswantentry,officestacksyncadmired,containers,techlist,agree_mars,aiselect,assessbenefits8,how old are you?,salary_range,jobsatpoints_10,adsactions,aiagentchange,agree_adblocker,importanthiringtitles,important_buildnew,stackoverflowanswer,important_control,versioncontrol,gender.1,adspriorities6,databasechoice,learningnewtech,newdevops,collaborateremote,dogs_vs_cats,educationtypes,blockchainis,expectedsalary,offon,embeddedwanttoworkwith,platformdesirenextyear,so_actions_6,devenvhaveentry,nondevelopertype,did you participate in the apptivate contest?,stackoverflowjobs,newofftopic,operatingsystem,educationparents,ergonomicdevices,knowledge_5,highesteducationparents,important_promotion,languageworkedwith,job_search_annoyance,workstart,devtype,newothercomms,aiagent_uses,difficultcommunication,excoder10years,assessjobprojects,importanthiringcommunication,select all that apply (why use stack overflow: help for job),webframedesirenextyear.1,projectmanagement,timeafterbootcamp,aiagentchallengesneutral,learncodecoursescert,employment,knowledge_4,sotagswant entry,hypotheticaltools5,lasthiredate,jobsecurity,workremote,techoppose_1
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,United States of America,,,,,,,,,,,,,,California,,,,,,,Stack Overflow,,,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,11,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Services,,,,,,,,,,,,,,,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,Start Up (1-25),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,"$2,001-$3,000",,,,,,,,,11,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,"$40,000 - $60,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,Start Up (1-25),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,40-50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,South America,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,,11,,,,,,,,,,,,,,,,iPhone,,,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,"$20,000 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows 7,,,,Mature Small Business (25-100),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Asia,,,,,,,,,,,,,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Healthcare,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FML,,,,,,,,,,,,,,,"$60,000 - $80,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,Mid Sized (100-999),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,,,"<$20,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,"Other (not working, consultant, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
# Find columns containing 'country'
country_columns = [col for col in df_use.columns if 'country' in str(col).lower()]

print("Columns containing 'country':")
for col in country_columns:
    try:
        # Select the column(s). If multiple columns share the same name this returns a DataFrame.
        selected = df_use.loc[:, col]
        # If a DataFrame is returned (duplicate column names), collapse to a single Series by taking
        # the first non-null value across duplicates for each row.
        if isinstance(selected, pd.DataFrame):
            if selected.shape[1] > 1:
                print(f"\nWarning: column name '{col}' is duplicated ({selected.shape[1]} columns). Combining duplicates by taking first non-null value.")
            series = selected.bfill(axis=1).iloc[:, 0]
        else:
            series = selected.squeeze()

        print(f"\nColumn: {col}")
        print("Top 5 most common values and their counts:")
        counts = series.fillna('NULL').value_counts().head()
        print(counts)

        # Get unique count excluding nulls
        unique_count = series.dropna().nunique()
        print(f"\nTotal unique values (excluding nulls): {unique_count}")
        print(f"Number of null values: {series.isnull().sum()}")
        print("-" * 50)
    except Exception as e:
        print(f"\nError processing column {col}: {str(e)}")
        print("-" * 50)


Columns containing 'country':


Column: country
Top 5 most common values and their counts:
country
United States               15827
NULL                        15330
India                       14075
United States of America    13283
Germany                     10046
Name: count, dtype: int64

Total unique values (excluding nulls): 235
Number of null values: 15330
--------------------------------------------------

Column: what country do you live in?
Top 5 most common values and their counts:
what country do you live in?
NULL              152992
United States        400
India                166
United Kingdom       143
Germany               85
Name: count, dtype: int64

Total unique values (excluding nulls): 80
Number of null values: 152992
--------------------------------------------------

Column: what country or region do you live in?
Top 5 most common values and their counts:
what country or region do you live in?
NULL                        150762
United States of America      1

In [21]:
# Dictionary of column groups to combine
column_groups = {
    'yearscode': ['yearscode', 'yearscodepro', 'yearsprogram'],
    'education': ['edlevel', 'education', 'formaleducation'],
    'employment': ['employment', 'employmentstatus', 'employment_status'],
    'companysize': ['companysize', 'orgsize', 'companyemployeesrange'],
    'salary': ['convertedsalary', 'convertedcomp', 'comptotal'],
    'jobsatisfaction': ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction'],
    'jobtitle': ['jobtitle', 'devtype', 'currentjobtitle', 'developertype'],
    'industry': ['industry', 'industrytype', 'companytype'],
    'location': ['country', 'location', 'countrycode'],
    'programmingexperience': ['yearscode', 'yearscoding', 'codingexperience'],
    'database': ['databaseworkedwith', 'databasedesirenextyear', 'dbworkedwith', 'dbnextyear'],
    'platform': ['platformworkedwith', 'platformdesirenextyear', 'platformnextyear'],
    'webframework': ['webframeworkworkedwith', 'webframeworkdesirenextyear', 'frameworknextyear'],
    'language': ['languageworkedwith', 'languagedesirenextyear', 'langnextyear'],
    'ide': ['ide', 'developmentenvironment', 'devenvironment'],
    'os': ['opsys', 'operatingsystem', 'os'],
    'devops': ['devopsworkedwith', 'devopsdesirenextyear', 'devopsnextyear'],
    'tools': ['toolstechworkedwith', 'toolstechdesirenextyear', 'toolsnextyear'],
    'methodology': ['methodology', 'devmethodology', 'developmentmethodology'],
    'communication': ['communicationtools', 'collaboration', 'collabtools'],
    'gender': ['gender', 'genderidentity', 'sex'],
    'age': ['age', 'agerange', 'agegrouping'],
    'learning': ['learncode', 'learncodehow', 'learningmethod'],
    'workexp': ['workexp', 'experience', 'yearsexperience'],
    'remote': ['remotework', 'workremote', 'remotestatus'],
    'team': ['teamsize', 'orgteamsize', 'developmentteamsize'],
    'version': ['versioncontrol', 'versioncontrolsystem', 'vcs'],
    'ai': ['aiworkedwith', 'aidesirenextyear', 'ainextyear'],
    'cloud': ['cloudplatformworkedwith', 'cloudplatformdesirenextyear', 'cloudnextyear']
}

In [22]:
df_use_2 = df_use.copy()

In [23]:
# Identify and combine duplicate columns
def combine_duplicate_columns(df):
    """
    Identifies columns with the same name (case-insensitive), combines their data
    into a single column by taking the first non-null value, and removes the originals.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with duplicate columns combined.
    """
    df_combined_duplicates = pd.DataFrame(index=df.index)
    processed_columns = set()

    for col_name in df.columns:
        col_name_lower = col_name.lower()

        if col_name_lower not in processed_columns:
            # Find all columns with this name (case-insensitive)
            duplicate_columns = [col for col in df.columns if col.lower() == col_name_lower]

            if len(duplicate_columns) > 1:
                print(f"Combining duplicate columns for '{col_name_lower}': {duplicate_columns}")
                # Select the duplicate columns
                selected_duplicates = df[duplicate_columns]
                # Combine by taking the first non-null value across rows
                combined_series = selected_duplicates.bfill(axis=1).iloc[:, 0]
                df_combined_duplicates[col_name_lower] = combined_series
                # Add to processed set
                processed_columns.add(col_name_lower)
            else:
                # Not a duplicate, just add the column
                df_combined_duplicates[col_name_lower] = df[col_name]
                processed_columns.add(col_name_lower)

    return df_combined_duplicates

# Apply the function to combine duplicate columns in df_use_2
df_use_combined_duplicates = combine_duplicate_columns(df_use_2)

print("\nOriginal DataFrame shape:", df_use_2.shape)
print("DataFrame shape after combining duplicates:", df_use_combined_duplicates.shape)

Combining duplicate columns for 'country': ['country', 'country']
Combining duplicate columns for 'gender': ['gender', 'gender']
Combining duplicate columns for 'webframeworkedwith': ['webframeworkedwith', 'webframeworkedwith']
Combining duplicate columns for 'webframedesirenextyear': ['webframedesirenextyear', 'webframedesirenextyear']
Combining duplicate columns for 'hobby': ['hobby', 'hobby']
Combining duplicate columns for 'industry': ['industry', 'industry']

Original DataFrame shape: (154521, 729)
DataFrame shape after combining duplicates: (154521, 723)


In [24]:
def combine_columns(df, column_list):
    """
    Combines data from a list of columns into a single Series,
    taking the first non-null value across the columns for each row.

    Args:
        df: The input pandas DataFrame.
        column_list: A list of column names to combine.

    Returns:
        A pandas Series containing the combined data.
    """
    # Select the specified columns
    selected_columns = df[column_list]

    # Combine columns by taking the first non-null value across rows
    combined_series = selected_columns.bfill(axis=1).iloc[:, 0]

    return combined_series

In [25]:
# Initialize an empty dictionary to store consolidated columns
consolidated_columns_dict = {}

# Iterate through the column_groups dictionary
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]

    # If there are columns from the current group present in the DataFrame
    if present_columns:
        print(f"Processing group '{group_name}' with columns: {present_columns}")
        # Call the combine_columns function
        combined_series = combine_columns(df_use_combined_duplicates, present_columns)
        # Store the resulting combined Series in the dictionary
        consolidated_columns_dict[group_name] = combined_series
    else:
        print(f"No columns found for group '{group_name}' in the DataFrame.")

# Create a new DataFrame from the dictionary of consolidated columns
df_consolidated = pd.DataFrame(consolidated_columns_dict)

print("\nConsolidated DataFrame created.")
print(f"Shape of consolidated DataFrame: {df_consolidated.shape}")
df_consolidated.head()

Processing group 'yearscode' with columns: ['yearscode', 'yearscodepro', 'yearsprogram']
Processing group 'education' with columns: ['edlevel', 'education', 'formaleducation']
Processing group 'employment' with columns: ['employment', 'employmentstatus', 'employment_status']
Processing group 'companysize' with columns: ['companysize', 'orgsize']
Processing group 'salary' with columns: ['convertedsalary', 'convertedcomp', 'comptotal']
Processing group 'jobsatisfaction' with columns: ['jobsatisfaction', 'job_satisfaction', 'careersatisfaction']
Processing group 'jobtitle' with columns: ['devtype', 'developertype']
Processing group 'industry' with columns: ['industry', 'companytype']
Processing group 'location' with columns: ['country']
Processing group 'programmingexperience' with columns: ['yearscode', 'yearscoding']
Processing group 'database' with columns: ['databaseworkedwith', 'databasedesirenextyear']
Processing group 'platform' with columns: ['platformworkedwith', 'platformdesiren

Unnamed: 0,yearscode,education,employment,companysize,salary,jobsatisfaction,jobtitle,industry,location,programmingexperience,database,platform,language,ide,os,methodology,communication,gender,age,learning,workexp,remote,version
0,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,


In [26]:
# Create a list of all original column names that were combined
original_columns_to_drop = []
for group_name, column_list in column_groups.items():
    # Identify columns in df_use_combined_duplicates that are present in the current group's list
    present_columns = [col for col in column_list if col in df_use_combined_duplicates.columns]
    original_columns_to_drop.extend(present_columns)

# Remove duplicates from the list of columns to drop
original_columns_to_drop = list(set(original_columns_to_drop))

# Drop these original columns from the df_use_combined_duplicates DataFrame
df_use_combined_duplicates_dropped = df_use_combined_duplicates.drop(columns=original_columns_to_drop, errors='ignore')

print("Original columns that were combined have been dropped.")
print(f"Shape of df_use_combined_duplicates after dropping original columns: {df_use_combined_duplicates_dropped.shape}")

Original columns that were combined have been dropped.
Shape of df_use_combined_duplicates after dropping original columns: (154521, 681)


In [27]:
# Concatenate the remaining columns from df_use_combined_duplicates_dropped with the consolidated columns
df_final = pd.concat([df_use_combined_duplicates_dropped, df_consolidated], axis=1)

print("\nFinal DataFrame created by concatenating remaining original columns and consolidated columns.")
print(f"Shape of the final DataFrame: {df_final.shape}")

# Display the first few rows of the final dataframe
print("\nFirst 5 rows of the final DataFrame:")
display(df_final.head())


Final DataFrame created by concatenating remaining original columns and consolidated columns.
Shape of the final DataFrame: (154521, 704)

First 5 rows of the final DataFrame:


Unnamed: 0,toolcountpersonal,newcollabtoolshaveworkedwith,techendorse_13_text,numbermonitors,vchostingprofessional use,select up to 3 (appealing message traits: message is personalized),jobemailpriorities3,what advertisers do you remember seeing on stack overflow? (open-ended response),select all that apply (future lang & tech: android),influenceviztools,truefalse_3,aidangerous,techoppose_16,lastnewjob,ainextvery similar,officestackasyncadmired,jobcontactpriorities2,blockchain,aimodelschoice,importanthiringgettingthingsdone,student,assessjobexp,agree_notice,newonboardgood,equipmentsatisfiedrw,opensourcer,assessbenefits6,un_subregion,otherpeoplescode,stackoverflowconsidermember,accessibility,age_range,adsagreedisagree3,so_actions_3,what other departments / roles do you interact with regularly? (system administrators),aiagentextwrite,"in receiving an email about a job opportunity, what attributes of the message would make you more likely to respond? (message is personalized to me)",sohowmuchtime,jobsatpoints_6,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (i can recommend or request products),currencydesc,select up to 3 (most important aspect of new job opportunity: salary),frameworkdesirenextyear,occupation_group,airesponsible,sexuality,aitoolcurrently using,adspriorities1,aiagents,aiagentexternal,surveytoolong,agentusesgeneral,blockchainorg,ainextmore integrated,homeremote,coderevhrs,timesearching,resumeprompted,databasewanttoworkwith,assessjob2,equipmentsatisfiedstorage,questionsinteresting,what country do you live in?,waketime,soaccount,questionsconfusing,assessbenefits1,techdoc,which of the following best describes your occupation?,aiagentimpactsomewhat agree,platformhaveworkedwith,frequency_3,cousineducation,misctechadmired,clickykeys,importanthiringopensource,adspriorities4,soduration,officestackwantentry,so_region,languagechoice,aimodelsadmired,influenceworkstation,surveyeasy,mgrwant,important_companymission,respondent,agree_alcohol,assessjobprofdevel,how large is the team that you work on?,techoppose_9,officestackhaveentry,hypotheticaltools2,seriouswork,assessbenefits4,tech_do,have you visited / are you aware of stack overflow careers 2.0?,adspriorities3,jobsatpoints_5,techendorse_7,aitoolnot interested in using,stackoverflowrecommend,important_wfh,aiagentknowledge,aliens,collector,rightwrongway,assessbenefits9,ainextno change,newcollabtoolsadmired,devenvschoice,stackoverflowjoblisting,rep_range,techoppose_15_text,toolcountwork,platformwantentry,stackoverflowsatisfaction,age_midpoint,stackoverflowfoundanswer,hopefiveyears,assessbenefits5,ainextless integrated,assessbenefits10,unit_testing,raceethnicity,what country or region do you live in?,newcollabtoolsdesirenextyear,dependents,aiopen,adspriorities7,devenviron,jobcontactpriorities1,sofindanswer,careersat,agree_diversity,databasehaveentry,webframechoice,webframewantentry,which us state or territory do you live in?,haveworkedframework,newcollabtoolsworkedwith,newpurplelink,aimodelshaveentry,select all that apply (source control used: git),which of our sites do you frequent most?,aitoolcurrently partially ai,webdevelopertype,officestacksyncwanttoworkwith,jobsatpoints_1,shipit,adblockerdisable,aiagentorchestration,so_dev_content,techendorse_4,sojobs,what type of project are you developing?,aisent,sexualorientation,interview_likelihood,ethicschoice,were you aware of the apptivate contest?,annoyingui,databaseadmired,aifrustration,responseid,assessbenefits2,visit_frequency,buildingthings,icorpm,influenceconsultants,betterlife,educationimportant,agreedisagree3,platformchoice,techoppose_11,"in the last 12 months, how much money have you spent on personal technology-related purchases?",ailearnhow,commplatformwantentr,mainbranch,assessjobdept,officestackasynchaveworkedwith,wantworkdatabase,important_ownoffice,how many years of it/programming experience do you have?,so_actions_15_text,aitooldon't plan to use ai for this task,commplatformhaveworkedwith,commit_frequency,assessjobleaders,majorundergrad,jobsatpoints_13,aiagentimpactsomewhat disagree,surveyease,select all that apply (current lang & tech: android),stackoverflowmoderation,assessjob4,stackoverflowjobsrecommend,which technology products do you own? (you can choose more than one) (iphone),knowledge_8,newsosites,changeworld,learncodeonline,professionalquestion,aiagentknowwrite,languageswantentry,what is your involvement in purchasing? you can choose more than 1. (influencer),select all that apply (most urgent info about job opportunity: salary),"in an average week, how do you spend your time at work? (developing new features)",agreedisagree1,do you have a stack overflow careers profile?,"if you make a software product, how does your company make money? (you can choose more than one) (advertising)",professional,where do you work remotely most of the time?,aiagentchallengesstrongly disagree,sotagshaveentry,stackoverflowmakemoney,workchallenge,sonewcontent,aisearchwanttoworkwith,so_actions_7,tech_want,how many people work for your company?,webframeworkedwith,stackoverflowjobsearch,toolstechadmired,jobemailpriorities5,newdevopsimpt,platformadmired,jobsatpoints_15_text,so_actions_9,mentalhealth,stackoverflowbetter,hobbyist,have you changed jobs in the last 12 months?,webframedesirenextyear,assessjobcommute,entteams,agree_problemsolving,ainextmuch less integrated,how do you prefer to be contacted about job opportunities? (email),truefalse_1,commplatformhaveentr,newedimpt,inthezone,exercise,newpurchaseresearch,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2013?",platformhaveentry,truefalse_2,how would you best describe the industry you work in?,jobfactors,aiagentobservesecure,aithreat,uk_country,importanthiringalgorithms,sotagswanttoworkwith,sopartfreq,aiagentimpactneutral,languageshaveentry,how often do you visit job boards?,welcomechange,languagewanttoworkwith,professionaltech,select all that apply (training & education: no formal training),hobby,select all that apply (why try stack overflow careers: no spam),knowledge_6,adblocker,desktop_os,influencecloud,updatecv,importantbenefits,techoppose_2,adblockerreasons,equipmentsatisfiedram,ethnicity,learncodechoose,employmentaddl,please rate your job/career satisfaction,yearscodedjobpast,devenvsadmired,techendorse_9,metricassess,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2014?",opsysprofessional use,select all that apply (why answer: help a programmer in need),currency,socialmedia,techoppose_7,techendorse_5,commplatformwanttoworkwith,"including bonus, what is your annual compensation in usd?",assessjobdiversity,influenceservers,assessjob7,so_actions_1,devenvwantentry,how do you use stack overflow? (read other people's questions to solve my problems),salary_midpoint,knowledge_9,databasewantentry,aiagentobswrite,select up to 3 (how can companies improve interview process: more live code),have you visited / are you aware of stack overflow careers?,aisearchdevadmired,webframehaveworkedwith,select up to 3 (most annoying about job search: finding time),workpaycare,what is your gender?,how did you find out about your current job?,adspriorities2,so_actions_5,university,how_to_improve_interview_process,ethicsreport,why_stack_overflow,stackoverflowcompanypage,extraversion,aicomplex,investtimetools,check,aiagentchallengesstrongly agree,excoderreturn,itperson,"including yourself, how many developers are employed at your company?",techendorse_6,importanthiringcompanies,soai,coderev,jobcontactpriorities4,developer_challenges,newjobhunt,equipmentsatisfiedcpu,surveyyear,jobsatpoints_14,understandcomputers,workplan,aiben,how often are you contacted by recruiters?,influencerecruitment,influenceinternet,techendorse_2,aitoolinterested in using,knowledge_2,jobsatpoints_11,so_actions_15,excoderactive,mgridiot,assessbenefits7,webframeadmired,assessjob5,jobsearchstatus,surveylong,agreedisagree2,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011? (<$10,000)",purchaseinfluence,techoppose_15,professionalcloud,toolstechwanttoworkwith,stackoverflowcopiedcode,knowledge_7,how likely is it that a recommendation you make will be acted upon?,ainextneither different nor similar,sofriction,please rate the advertising you've seen on stack overflow (the ads are relevant),sovisitfreq,learnedhiring,interestedanswers,salary,surveylength,competepeers,ainextsomewhat similar,excoderbalance,us_state,occupation,sotimesaved,aiagentorchwrite,adsagreedisagree2,"in an average week, how do you spend your time? (developing new features)",embeddedhaveworkedwith,challengemyself,stackoverflowcommunity,problemsolving,platformwanttoworkwith,jobcontactpriorities3,ainextvery different,techoppose_5,age1stcode,women_on_team,dev_environment,diversityimportant,ainextsomewhat different,assessjob10,mgrmoney,techoppose_13,timeanswering,techendorse_13,aidevwanttoworkwith,influencedatabase,assessjob8,excoderwillnotcode,frustration,kinshipdevelopers,stackoverflowvisit,so_actions_4,newcollabtoolswanttoworkwith,aitoolplan to mostly use ai,jobemailpriorities7,enjoydebugging,hackathonreasons,jobseekingstatus,auditoryenvironment,agree_legacy,knowledge_3,aidevhaveworkedwith,experience_midpoint,new_job_value,tabsspaces,techendorse,"please rate how important each of the following characteristics of a company/job offer are to you. please select a maximum of 3 items as ""non-negotiables"" to help us identify the most important items, those where you would never consider a company if they didn't meet them. (high base compensation)",screenname,sohow,programming_ability,adspriorities5,jobsatpoints_15,star_wars_vs_star_trek,wantworkframework,misctechdesirenextyear,purchasewhat,influencecommunication,influencehardware,vchostingpersonal use,devenvswanttoworkwith,aitoolplan to partially use ai,aiagentchallengessomewhat agree,"what is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?",self_identification,stackoverflowadsdistracting,misctechworkedwith,sovisit1st,excoderbelonged,how many developers are employed at your company?,misctechhaveworkedwith,sotagsadmired,assessjob3,buynewtool,what is your current stack overflow reputation?,timefullyproductive,wantworklanguage,stackoverflowhasaccount,stackoverflownewquestion,remote,languageadmired,jobseek,assessjobremote,stackoverflowdevstory,stackoverflowmetachat,important_sameend,jobsatpoints_9,jobemailpriorities6,influencedepttech,do you enjoy working remotely?,race,fizzbuzz,devenvshaveworkedwith,sotagshaveworkedwith,programhobby,databasehaveworkedwith,vcinteraction,techoppose_3,newrole,q120,aifuture,adsagreedisagree1,impsyn,jobsatpoints_7,aihuman,officestackasyncwanttoworkwith,how often do you find solutions to your programming problems on stack overflow without asking a new question?,assessjob1,aitoolcurrently mostly ai,aiagentimpactstrongly agree,ethicsresponsible,wantworkplatform,haveworkedplatform,friendsdevelopers,newjobhuntresearch,languagehaveworkedwith,hoursperweek,excodernotforme,jobsatpoints_16,what operating system do you use the most?,webframewanttoworkwith,socomm,jobsatpoints_4,which best describes the size of your company?,job_discovery,jobprofile,undergradmajor,aisearchdevhaveworkedwith,techendorse_3,which technologies are you excited about? (node.js),officestacksynchaveworkedwith,salarytype,assessjobrole,aisearchhaveworkedwith,ethicalimplications,why_learn_new_tech,assessjobindustry,selftaughttypes,influencetechstack,misctechwanttoworkwith,how would you best describe the industry you currently work in?,aiacc,what types of purchases are you involved in? (hardware),experience_range,assessbenefits3,checkincode,agree_loveboss,codingactivities,pronouncegif,frequency_1,unittests,opsyspersonal use,techendorse_1,equipmentsatisfiedmonitors,assessjob9,mobiledevelopertype,frequency_2,newovertime,workweekhrs,currencysymbol,open_to_new_job,aichallenges,important_variety,jobemailpriorities1,stackoverflowadsrelevant,are you currently looking for a job or open to new opportunities?,jobcontactpriorities5,skipmeals,big_mac_index,assessbenefits11,so_actions_16,boringdetails,techendorseintro,jobemailpriorities4,do you work remotely?,sovisitto,yearscodingprof,trans,aisearchdevwanttoworkwith,resumeupdate,"you answered you don't have a careers profile, can you elaborate why?",assessjob6,toolstechhaveworkedwith,buildvsbuy,assessjoboffice,"if your company has a native mobile app, what platforms do you support? (iphone)",hypotheticaltools3,important_newtech,excoderskills,stackoverflowwhatdo,stackoverflowdescribes,assessjobcompensation,importanthiringpmexp,importanthiringeducation,jobsatpoints_8,which languages are you proficient in? (java),ainextmuch more integrated,team_size_range,overpaid,yearscodedjob,stackoverflowhelpful,compfreq,hoursoutside,onboarding,aiagentchallengessomewhat disagree,assessjobproduct,lastint,jobsat,assessjobtech,importanthiringrep,purchasehow,agree_nightcode,do you have a stack overflow careers 2.0 profile?,newstuck,so_actions_10,embeddedadmired,aiexplain,tbranch,important_buildexisting,stackoverflowparticipate,agree_tech,what best describes your career / job satisfaction?,hypotheticaltools4,aiinteresting,hourscomputer,commplatformadmired,convertedcompyearly,learncodeai,stackoverflowdevices,militaryus,aimodelswanttoworkwith,aiethics,knowledge_1,hypotheticaltools1,opensource,haveworkedlanguage,frameworkworkedwith,webframehaveentry,what is your involvement in purchasing products or services for the company you work for? (you can choose more than one) (influencer),importanthiringtechexp,which of the following languages or technologies have you used significantly in the past year? (c),aimodelshaveworkedwith,eduother,aiagentimpactstrongly disagree,which desktop operating system do you use the most?,haveworkeddatabase,jobemailpriorities2,workloc,company_size_range,techendorse_8,newlearn,aimodelswantentry,officestacksyncadmired,containers,techlist,agree_mars,aiselect,assessbenefits8,how old are you?,salary_range,jobsatpoints_10,adsactions,aiagentchange,agree_adblocker,importanthiringtitles,important_buildnew,stackoverflowanswer,important_control,adspriorities6,databasechoice,learningnewtech,newdevops,collaborateremote,dogs_vs_cats,educationtypes,blockchainis,expectedsalary,offon,embeddedwanttoworkwith,so_actions_6,devenvhaveentry,nondevelopertype,did you participate in the apptivate contest?,stackoverflowjobs,newofftopic,educationparents,ergonomicdevices,knowledge_5,highesteducationparents,important_promotion,job_search_annoyance,workstart,newothercomms,aiagent_uses,difficultcommunication,excoder10years,assessjobprojects,importanthiringcommunication,select all that apply (why use stack overflow: help for job),projectmanagement,timeafterbootcamp,aiagentchallengesneutral,learncodecoursescert,knowledge_4,sotagswant entry,hypotheticaltools5,lasthiredate,jobsecurity,techoppose_1,yearscode,education,employment,companysize,salary.1,jobsatisfaction,jobtitle,industry,location,programmingexperience,database,platform,language,ide,os,methodology,communication,gender,age,learning,workexp,remote.1,version
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Server Programmer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,United States of America,,,,,,,,,,,,,California,,,,,,Stack Overflow,,,,,,,,,,,Web Platform,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,11,,,,,,,,,,,,,,iPhone,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Services,,,,,,,,,,,,,,,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,Unless it's stoopid it gets done,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,Start Up (1-25),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Web Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,"$2,001-$3,000",,,,,,,,11,,,,,,,,,,,,,,iPhone,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,"$40,000 - $60,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,It's been known to happen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,Start Up (1-25),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,40-50,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,South America,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,Enterprise,,,,,,,,,,,,,,,,,,,,"$501-$1,000",,,,,,,,11,,,,,,,,,,,,,,iPhone,,,,,,,,Influencer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Software Products,,,,,,,,,,,,,,,,,,,,,,,,,,,,,So happy it hurts,,,,,,,,,,,,,"$20,000 - $40,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,"<$10,000",,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows 7,,,,Mature Small Business (25-100),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Desktop Application Developer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Asia,,,,,,,,,,,,,,,,,,,Programmers Stack Exchange,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,">$3,000",,,,,,,,41435,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Healthcare,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FML,,,,,,,,,,,,,"$60,000 - $80,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,Not in a million years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Mac OS X,,,,Mid Sized (100-999),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30-34,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Student,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other Europe,,,,,,,,,,,,,,,,,,,Stack Overflow,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,$251-$500,,,,,,,,41310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I enjoy going to work,,,,,,,,,,,,,"<$20,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,,,,,,,,,,,,,,,,,,,,,,,,,,,,Once in a blue moon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Linux,,,,"Other (not working, consultant, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Java,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [28]:
df_final.to_csv('df_final.csv')

In [29]:
combined_consolidated_df = df_final.copy()

In [None]:
#combined_consolidated_df = pd.read_csv("df_final.csv")

In [30]:
# Detailed column stats sorted by non-null count (descending)
col_stats = pd.DataFrame({
    'non_null_count': combined_consolidated_df.notna().sum(),
    'null_count': combined_consolidated_df.isna().sum(),
    'unique_count': combined_consolidated_df.nunique(dropna=True)
}).sort_values('non_null_count', ascending=False)

col_stats.head(50)

Unnamed: 0,non_null_count,null_count,unique_count
surveyyear,154521,0,15
employment,140846,13675,151
location,139191,15330,235
education,137278,17243,554
programmingexperience,116643,37878,126
jobtitle,113405,41116,14745
age,109280,45241,109
yearscode,108160,46361,136
mainbranch,102584,51937,9
companysize,97185,57336,15
