In [2]:
import os
import pandas as pd


In [None]:

def extract_column_to_csv(excel_file_path, column_name, output_csv_path):
    """
    Extract a specific column from an Excel file and save it as a CSV file.
    
    Parameters:
    - excel_file_path: Path to the Excel file
    - column_name: The name of the column to extract
    - output_csv_path: Path to save the extracted column as a CSV file
    """
    try:
        # Read the Excel file
        sheet_data = pd.read_excel(excel_file_path)
        
        # Check if the column exists in the sheet
        if column_name in sheet_data.columns:
            # Extract the column and save it to CSV
            sheet_data[[column_name]].dropna().to_csv(output_csv_path, index=False)
            print(f"Column '{column_name}' has been successfully extracted to CSV.")
        else:
            print(f"Column '{column_name}' not found in the Excel file.")
    except Exception as e:
        print(f"Error: {e}")

# Example usage with absolute path
excel_file_path = r'CPCB/Producer_Registered.xlsx'  # Adjust with the correct path
column_name = 'Name'  # Replace with the column name you want to extract
output_csv_path = 'output_file.csv'  # Path to save the output CSV file

extract_column_to_csv(excel_file_path, column_name, output_csv_path)

Column 'Name' has been successfully extracted to CSV.


In [None]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file):
    """
    Merge all CSV files from a specified folder into a single CSV file.
    
    Parameters:
    - input_folder: Path to the folder containing the CSV files
    - output_file: Path to save the merged CSV file
    """
    try:
        # List to hold all the DataFrames
        all_csv_data = []

        # Loop through all files in the specified folder
        for file in os.listdir(input_folder):
            if file.endswith('.csv'):
                file_path = os.path.join(input_folder, file)
                # Read each CSV file and append its content to the list
                data = pd.read_csv(file_path)
                all_csv_data.append(data)

        # Concatenate all CSVs into a single DataFrame
        merged_data = pd.concat(all_csv_data, ignore_index=True)

        # Save the merged DataFrame to a new CSV file
        merged_data.to_csv(output_file, index=False)
        print(f"Merged CSV file saved at: {output_file}")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
input_folder = r'Cpcb_csv'  # Replace with the folder containing your CSV files
output_file = r'D:\match_data\merged_output.csv'  # Path to save the merged CSV file

merge_csv_files(input_folder, output_file)


Merged CSV file saved at: D:\match_data\merged_output.csv


In [26]:

def extract_column_to_csv(excel_file_path, column_name, output_csv_path):
    """
    Extract a specific column from an Excel file and save it as a CSV file.
    
    Parameters:
    - excel_file_path: Path to the Excel file
    - column_name: The name of the column to extract
    - output_csv_path: Path to save the extracted column as a CSV file
    """
    try:
        # Read the Excel file
        sheet_data = pd.read_excel(excel_file_path)
        
        # Check if the column exists in the sheet
        if column_name in sheet_data.columns:
            # Extract the column and save it to CSV
            sheet_data[[column_name]].dropna().to_csv(output_csv_path, index=False)
            print(f"Column '{column_name}' has been successfully extracted to CSV.")
        else:
            print(f"Column '{column_name}' not found in the Excel file.")
    except Exception as e:
        print(f"Error: {e}")

# Example usage with absolute path
excel_file_path = r'MPCB/MPCB_Recycler.xlsx'  # Adjust with the correct path
column_name = 'Name'  # Replace with the column name you want to extract
output_csv_path = 'output_file.csv'  # Path to save the output CSV file

extract_column_to_csv(excel_file_path, column_name, output_csv_path)

Column 'Name' has been successfully extracted to CSV.


In [29]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file):
    """
    Merge all CSV files from a specified folder into a single CSV file.
    
    Parameters:
    - input_folder: Path to the folder containing the CSV files
    - output_file: Path to save the merged CSV file
    """
    try:
        # List to hold all the DataFrames
        all_csv_data = []

        # Loop through all files in the specified folder
        for file in os.listdir(input_folder):
            if file.endswith('.csv'):
                file_path = os.path.join(input_folder, file)
                # Read each CSV file and append its content to the list
                data = pd.read_csv(file_path)
                all_csv_data.append(data)

        # Concatenate all CSVs into a single DataFrame
        merged_data = pd.concat(all_csv_data, ignore_index=True)

        # Save the merged DataFrame to a new CSV file
        merged_data.to_csv(output_file, index=False)
        print(f"Merged CSV file saved at: {output_file}")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
input_folder = r'Mpcb_csv'  # Replace with the folder containing your CSV files
output_file = r'D:\match_data\merged_output_1.csv'  # Path to save the merged CSV file

merge_csv_files(input_folder, output_file)


Merged CSV file saved at: D:\match_data\merged_output_1.csv


In [28]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file, column_name='Name'):
    """
    Merge all CSV files from a specified folder into a single CSV file with one column named 'Name'.
    
    Parameters:
    - input_folder: Path to the folder containing the CSV files
    - output_file: Path to save the merged CSV file
    - column_name: The name of the column to extract (default is 'Name')
    """
    try:
        # List to hold all the company names under 'Name' column
        all_data = []

        # Loop through all files in the specified folder
        for file in os.listdir(input_folder):
            if file.endswith('.csv'):
                file_path = os.path.join(input_folder, file)
                # Read each CSV file
                data = pd.read_csv(file_path)
                
                # Ensure the 'column_name' exists in the CSV, then extract the data
                if column_name in data.columns:
                    all_data.extend(data[column_name].dropna().tolist())  # Add the data to the list

        # Create a DataFrame from the list of all data
        merged_data = pd.DataFrame({column_name: all_data})

        # Save the merged DataFrame to a new CSV file
        merged_data.to_csv(output_file, index=False)
        print(f"Merged CSV file saved at: {output_file}")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
input_folder = r'Mpcb_csv'  # Replace with the folder containing your CSV files
output_file = r'D:\match_data\merged_output_1.csv'  # Path to save the merged CSV file

merge_csv_files(input_folder, output_file)


Merged CSV file saved at: D:\match_data\merged_output_1.csv


In [30]:
import pandas as pd

def compare_csv_files(file1_path, file2_path, column_name='Name'):
    """
    Compare two CSV files and print names that are in one file but not in the other.
    
    Parameters:
    - file1_path: Path to the first CSV file
    - file2_path: Path to the second CSV file
    - column_name: The name of the column to compare (default is 'Name')
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_csv(file1_path)
        df2 = pd.read_csv(file2_path)

        # Check if the column exists in both files
        if column_name not in df1.columns or column_name not in df2.columns:
            print(f"Column '{column_name}' not found in one of the files.")
            return
        
        # Extract the 'Name' column and drop any NaN values
        names_file1 = set(df1[column_name].dropna().str.strip().str.lower())  # Normalize to lower case
        names_file2 = set(df2[column_name].dropna().str.strip().str.lower())  # Normalize to lower case

        # Compare and find names that are in file1 but not in file2, and vice versa
        unique_to_file1 = names_file1 - names_file2
        unique_to_file2 = names_file2 - names_file1

        # Print the results
        if unique_to_file1:
            print(f"Names in {file1_path} but not in {file2_path}:")
            print("\n".join(unique_to_file1))
        else:
            print(f"No names found in {file1_path} but not in {file2_path}.")

        if unique_to_file2:
            print(f"\nNames in {file2_path} but not in {file1_path}:")
            print("\n".join(unique_to_file2))
        else:
            print(f"No names found in {file2_path} but not in {file1_path}.")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
file1_path = r'merged_output_1.csv'  # Path to the first CSV file
file2_path = r'merged_output.csv'  # Path to the second CSV file

compare_csv_files(file1_path, file2_path)


Names in merged_output_1.csv but not in merged_output.csv:
threco recycling llp
habib enterprises
resgen limited
earth care solutions
shakti corporation
swastik plasto
agarwal technoplast pvt. ltd.
sky traders
greentech recyclers
national enterprises
swanyog industrie pvt ltd
pur o fuel pvt ltd
sana traders
deluxe recycling (india).pvt.ltd
s h enterprises
heramb polyflakes
p.k.enterprises
foram enterprises
huhtamaki foundation
uma plastic
sageer ahamad shabber khan
national plastic udyog
k. k. nag pvt. ltd
gayatri packaging
nida plastics
universal waste management
m k plastics
vista packaging private limited
shree plastics industries
shree bhavani engineering works
rudra blue planet environmental solutions india ltd
kalp multipack private limited
aqsa stamping
m k scrap traders
gemcorp recycling & technologies pvt. ltd.
cemech engineering industries
polythene indusries
m/s. mustafa traders pvt. ltd.
suyash industries
liberty polymers
navbharat recycling industries
blue star drum suppli

In [32]:
import pandas as pd
import re

def clean_company_name(name):
    """
    Clean the company name by removing any address or extra details after the first comma.
    Assumes that the company name is the first part of the string before any commas or line breaks.
    
    Parameters:
    - name: The raw company name and address string
    
    Returns:
    - Cleaned company name
    """
    if pd.isna(name):
        return ""
    # Remove everything after the first comma (assuming the name comes before the comma)
    cleaned_name = name.split(',')[0]
    # Optionally, you can add further processing to remove unwanted characters, like newline characters
    return cleaned_name.strip()

def extract_and_clean_name_column(file_path, output_file):
    """
    Extract and clean the 'Name & Address' columns and save the cleaned company names to a new CSV.
    
    Parameters:
    - file_path: Path to the input CSV file
    - output_file: Path to save the cleaned output CSV file
    """
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Columns to clean
        name_columns = ['Name & Address', 'NAME & ADDRESS', 'Name']
        
        # Check if any of the columns exist
        for col in name_columns:
            if col in df.columns:
                # Clean the data in the column
                df[col] = df[col].apply(clean_company_name)
        
        # Save the cleaned company names to the output CSV
        df.to_csv(output_file, index=False)
        print(f"Cleaned data saved to: {output_file}")
    
    except Exception as e:
        print(f"Error: {e}")

# Example usage
input_file = 'merged_output_1.csv'  # Replace with your input CSV file path
output_file = 'cleaned_output.csv'  # Path to save the cleaned output CSV file

extract_and_clean_name_column(input_file, output_file)


Cleaned data saved to: cleaned_output.csv


In [36]:
import pandas as pd
import re

def clean_company_name(name):
    """
    Clean the company name by removing any address or extra details after the first comma or keywords like 'Plot No', 'Survey No', etc.
    
    Parameters:
    - name: The raw company name and address string
    
    Returns:
    - Cleaned company name
    """
    if pd.isna(name):
        return ""
    
    # Remove text starting from 'Plot No', 'Survey No', 'Gat No', or any commas
    name = re.sub(r'[,|.].*', '', name)  # Remove everything after the first comma or period
    name = re.sub(r'Plot No.*', '', name)  # Remove anything after 'Plot No'
    name = re.sub(r'Survey No.*', '', name)  # Remove anything after 'Survey No'
    name = re.sub(r'Gat No.*', '', name)  # Remove anything after 'Gat No'
    
    # Clean up any extra spaces and return the cleaned name
    return name.strip()

def merge_and_clean_name_columns(input_file, output_file):
    """
    Merge the 'Name & Address', 'NAME & ADDRESS', 'Name' columns into one 'Name' column
    and clean the company names by removing any address or extra details.
    
    Parameters:
    - input_file: Path to the input CSV file
    - output_file: Path to save the cleaned output CSV file
    """
    try:
        # Read the CSV file
        df = pd.read_csv(input_file)
        
        # Columns to merge
        name_columns = ['Name & Address', 'NAME & ADDRESS', 'Name']
        
        # Merge the columns into one 'Name' column by prioritizing non-null values
        df['Name'] = df[name_columns].apply(lambda row: next((x for x in row if pd.notna(x)), ""), axis=1)
        
        # Clean the 'Name' column by removing addresses or extra details
        df['Name'] = df['Name'].apply(clean_company_name)
        
        # Optionally, drop the original columns if not needed
        df.drop(columns=name_columns, inplace=True)
        
        # Save the cleaned data with merged 'Name' column to a new CSV file
        df.to_csv(output_file, index=False)
        print(f"Cleaned and merged data saved to: {output_file}")
    
    except Exception as e:
        print(f"Error: {e}")

# Example usage
input_file = 'cleaned_output.csv'  # Replace with your input CSV file path
output_file = 'cleaned_merged_output.csv'  # Path to save the cleaned and merged output CSV file

merge_and_clean_name_columns(input_file, output_file)


Cleaned and merged data saved to: cleaned_merged_output.csv


In [45]:
import pandas as pd

def compare_csv_files(file1_path, file2_path, column_name='Name', output_file='comparison_result.csv'):
    """
    Compare two CSV files and save names that are in one file but not in the other into a CSV file.
    
    Parameters:
    - file1_path: Path to the first CSV file
    - file2_path: Path to the second CSV file
    - column_name: The name of the column to compare (default is 'Name')
    - output_file: The path to save the comparison results (default is 'comparison_result.csv')
    """
    try:
        # Read the CSV files into DataFrames
        df1 = pd.read_csv(file1_path)
        df2 = pd.read_csv(file2_path)

        # Check if the column exists in both files
        if column_name not in df1.columns or column_name not in df2.columns:
            print(f"Column '{column_name}' not found in one of the files.")
            return
        
        # Extract the 'Name' column and drop any NaN values
        names_file1 = set(df1[column_name].dropna().str.strip().str.lower())  # Normalize to lower case
        names_file2 = set(df2[column_name].dropna().str.strip().str.lower())  # Normalize to lower case

        # Compare and find names that are in file1 but not in file2, and vice versa
        unique_to_file1 = names_file1 - names_file2
        unique_to_file2 = names_file2 - names_file1

        # Convert sets to lists
        unique_to_file1 = list(unique_to_file1)
        unique_to_file2 = list(unique_to_file2)

        # Adjust the lengths of both lists by padding the shorter one with None
        max_length = max(len(unique_to_file1), len(unique_to_file2))
        unique_to_file1.extend([None] * (max_length - len(unique_to_file1)))  # Pad with None
        unique_to_file2.extend([None] * (max_length - len(unique_to_file2)))  # Pad with None

        # Prepare the result DataFrame
        result_data = {
            'Names in File1 but not in File2': unique_to_file1
        }

        result_df = pd.DataFrame(result_data)

        # Save the results into a CSV file
        result_df.to_csv(output_file, index=False)
        print(f"Comparison results saved to {output_file}")

    except Exception as e:
        print(f"Error: {e}")

# Example usage
file1_path = 'Mpcb_companies.csv'  # Path to the first CSV file
file2_path = 'Cpcb_companies.csv'  # Path to the second CSV file
output_file = 'comparison_result.csv'  # Output file path

compare_csv_files(file1_path, file2_path, output_file=output_file)


Comparison results saved to comparison_result.csv
