In [None]:
# Mount Google Drive so the rest of the notebook can read and write files under /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is already
# mounted from an earlier run — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import requests
import pandas as pd
import os
import time
import json
import pyarrow.parquet as pq
import xml.etree.ElementTree as ET
# Azure Maps API key.
# SECURITY: a real subscription key used to be hard-coded on this line. Secrets must
# never be stored in a notebook; the key that was exposed here should be rotated.
# The key is read from the AZURE_MAPS_API_KEY environment variable and, when that
# is not set, requested interactively with getpass (so it is not stored in the file).
import getpass

api_key = os.environ.get('AZURE_MAPS_API_KEY')
if not api_key:
    api_key = getpass.getpass('Azure Maps API key: ')
    # Keep the environment variable in sync so later cells can read it too
    os.environ['AZURE_MAPS_API_KEY'] = api_key

# Function to fetch country and region from Azure Maps API
def get_location_info(reviewer_location, timeout=10):
    """Look up the country and region of a free-text location with Azure Maps.

    Sends ``reviewer_location`` as the ``query`` of the Azure Maps
    ``search/address`` endpoint (api-version 1.0) and reads the ``address`` of the
    first result.

    Args:
        reviewer_location: Free-text location to geocode. It is converted to a
            string and stripped before being sent.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(country, region)`` tuple taken from the ``country`` and
        ``countrySubdivision`` fields of the first result. ``(None, None)`` is
        returned when the query is empty, the service answers with an HTTP error,
        there are no results, or any exception occurs (errors are printed, never
        raised).
    """
    query = str(reviewer_location).strip()
    if not query:
        print(f"No results found for {reviewer_location}")
        return None, None

    try:
        # Pass the query through `params` so requests URL-encodes it; building the
        # URL by hand breaks on characters such as '&', '#' or spaces.
        response = requests.get(
            'https://atlas.microsoft.com/search/address/json',
            params={
                'api-version': '1.0',
                'query': query,
                'subscription-key': api_key,
            },
            timeout=timeout,
        )

        # An error response (bad key, rate limit, ...) has no 'results' either;
        # report it as a failure rather than as "no results".
        if not response.ok:
            print(f"Azure Maps request failed for {reviewer_location}: HTTP {response.status_code}")
            return None, None

        data = response.json()
        results = data.get('results') or []
        if not results:
            print(f"No results found for {reviewer_location}")
            return None, None

        address = results[0].get('address', {})
        return address.get('country'), address.get('countrySubdivision')
    except Exception as e:
        # Exception text from the HTTP layer can contain the request URL, which
        # includes the subscription key; redact it before printing.
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), '***')
        print(f"Error fetching location info for {reviewer_location}: {message}")
        return None, None

# Function to create folder if it doesn't exist
def create_folder_if_not_exist(folder_path):
    """Create ``folder_path`` (including missing parent folders) if it is absent.

    Calling it for a folder that already exists is a no-op.

    Args:
        folder_path: Path of the folder to create.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(folder_path, exist_ok=True)

# Function to process CSV files
def process_csv(input_path, output_path):
    """Add ``country`` and ``region`` columns to a CSV file of reviews.

    Reads ``input_path``, resolves every non-null ``reviewer_location`` with
    ``get_location_info`` and writes the enriched table to ``output_path``.
    Rows without a location keep empty ``country``/``region`` values.

    A location that was resolved successfully is remembered for the rest of the
    file, so repeated locations cost a single API call. Failed lookups are not
    remembered and are retried on their next occurrence.

    Args:
        input_path: CSV file to read; it must contain a ``reviewer_location`` column.
        output_path: Destination CSV file (written without the index).
    """
    df = pd.read_csv(input_path)
    df['country'] = None
    df['region'] = None

    resolved = {}  # location -> (country, region) for successful lookups
    for index, reviewer_location in df['reviewer_location'].items():
        if pd.isna(reviewer_location):
            continue

        location_info = resolved.get(reviewer_location)
        if location_info is None:
            location_info = get_location_info(reviewer_location)
            time.sleep(0.1)  # Throttle requests to avoid rate limits
            if location_info != (None, None):
                resolved[reviewer_location] = location_info

        country, region = location_info
        df.at[index, 'country'] = country
        df.at[index, 'region'] = region

    df.to_csv(output_path, index=False)
    print(f"Processed {input_path} and saved to {output_path}")

# Function to process JSON files
def process_json(input_path, output_path):
    """Add ``country`` and ``region`` keys to every review in a JSON file.

    The file is expected to hold an iterable of review objects. Each review with
    a non-empty ``reviewer_location`` is resolved with ``get_location_info`` and
    gets ``country`` and ``region`` keys; other reviews are left untouched. The
    result is written to ``output_path`` with an indent of 4.

    A location that was resolved successfully is remembered for the rest of the
    file, so repeated locations cost a single API call. Failed lookups are not
    remembered and are retried on their next occurrence.

    Args:
        input_path: JSON file to read (decoded as UTF-8).
        output_path: Destination JSON file (encoded as UTF-8).
    """
    # Be explicit about UTF-8 instead of relying on the platform default encoding
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    resolved = {}  # location text -> (country, region) for successful lookups
    for review in data:
        reviewer_location = review.get('reviewer_location')
        if not reviewer_location:
            continue

        # Key on the text form so unhashable JSON values cannot break the cache
        cache_key = str(reviewer_location)
        location_info = resolved.get(cache_key)
        if location_info is None:
            location_info = get_location_info(reviewer_location)
            time.sleep(0.1)  # Throttle requests to avoid rate limits
            if location_info != (None, None):
                resolved[cache_key] = location_info

        review['country'], review['region'] = location_info

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
    print(f"Processed {input_path} and saved to {output_path}")

# Function to process Parquet files
def process_parquet(input_path, output_path):
    """Add ``country`` and ``region`` columns to a Parquet file of reviews.

    Reads ``input_path`` into a DataFrame, resolves every non-null
    ``reviewer_location`` with ``get_location_info`` and writes the enriched
    table to ``output_path``. Rows without a location keep empty
    ``country``/``region`` values.

    A location that was resolved successfully is remembered for the rest of the
    file, so repeated locations cost a single API call. Failed lookups are not
    remembered and are retried on their next occurrence.

    Args:
        input_path: Parquet file to read; it must contain a ``reviewer_location`` column.
        output_path: Destination Parquet file (written without the index).
    """
    df = pq.read_table(input_path).to_pandas()
    df['country'] = None
    df['region'] = None

    resolved = {}  # location -> (country, region) for successful lookups
    for index, reviewer_location in df['reviewer_location'].items():
        if pd.isna(reviewer_location):
            continue

        location_info = resolved.get(reviewer_location)
        if location_info is None:
            location_info = get_location_info(reviewer_location)
            time.sleep(0.1)  # Throttle requests to avoid rate limits
            if location_info != (None, None):
                resolved[reviewer_location] = location_info

        country, region = location_info
        df.at[index, 'country'] = country
        df.at[index, 'region'] = region

    df.to_parquet(output_path, index=False)
    print(f"Processed {input_path} and saved to {output_path}")

# Function to process XML files
def process_xml(input_path, output_path):
    """Append ``country`` and ``region`` elements to every review in an XML file.

    Each ``review`` child of the root whose ``reviewer_location`` element holds
    non-blank text is resolved with ``get_location_info`` and receives new
    ``country`` and ``region`` sub-elements. Reviews without a
    ``reviewer_location`` element, or with an empty one, are left untouched.

    A location that was resolved successfully is remembered for the rest of the
    file, so repeated locations cost a single API call. Failed lookups are not
    remembered and are retried on their next occurrence.

    Args:
        input_path: XML file to read.
        output_path: Destination XML file.
    """
    tree = ET.parse(input_path)
    root = tree.getroot()

    resolved = {}  # location -> (country, region) for successful lookups
    for review in root.findall('review'):
        # findtext returns None when the element is missing, instead of the
        # AttributeError raised by `.find(...).text`
        reviewer_location = (review.findtext('reviewer_location') or '').strip()
        if not reviewer_location:
            continue

        location_info = resolved.get(reviewer_location)
        if location_info is None:
            location_info = get_location_info(reviewer_location)
            time.sleep(0.1)  # Throttle requests to avoid rate limits
            if location_info != (None, None):
                resolved[reviewer_location] = location_info

        country, region = location_info
        ET.SubElement(review, 'country').text = country
        ET.SubElement(review, 'region').text = region

    tree.write(output_path)
    print(f"Processed {input_path} and saved to {output_path}")

# Function to get the sorted file list by size
def get_sorted_files_by_size(folder_path, file_extension):
    """Return the paths in ``folder_path`` ending with ``file_extension``, smallest first.

    Args:
        folder_path: Folder whose entries are listed.
        file_extension: Suffix an entry name must end with (for example ``'.csv'``).

    Returns:
        A list of paths (``folder_path`` joined with the entry name) ordered by
        ascending file size.
    """
    matching_paths = []
    for entry_name in os.listdir(folder_path):
        if entry_name.endswith(file_extension):
            matching_paths.append(os.path.join(folder_path, entry_name))

    # Ascending size, so the smallest file comes first
    return sorted(matching_paths, key=os.path.getsize)

# Main function to process files, starting with the smallest
def process_all_files(input_base_path, output_base_path):
    """Enrich every CSV, JSON, Parquet and XML file under ``input_base_path``.

    For each format, the files in ``<input_base_path>/<format>`` are processed
    from smallest to largest and written under the same file name to
    ``<output_base_path>/<format>``. Formats are handled in the order csv, json,
    parquet, xml. A format whose input folder does not exist is skipped with a
    message instead of aborting the remaining formats.

    Args:
        input_base_path: Folder containing the per-format input sub-folders.
        output_base_path: Folder receiving the per-format output sub-folders;
            created if missing.
    """
    # (sub-folder name, file extension, function that processes one file)
    format_handlers = (
        ('csv', '.csv', process_csv),
        ('json', '.json', process_json),
        ('parquet', '.parquet', process_parquet),
        ('xml', '.xml', process_xml),
    )

    # Create the output root if it doesn't exist
    create_folder_if_not_exist(output_base_path)

    for subfolder, extension, handler in format_handlers:
        input_folder = os.path.join(input_base_path, subfolder)
        if not os.path.isdir(input_folder):
            print(f"Skipping {subfolder}: input folder {input_folder} does not exist")
            continue

        output_folder = os.path.join(output_base_path, subfolder)
        create_folder_if_not_exist(output_folder)

        # Smallest files first
        for input_file in get_sorted_files_by_size(input_folder, extension):
            output_file = os.path.join(output_folder, os.path.basename(input_file))
            handler(input_file, output_file)

# Example usage
# Destination folder for the enriched reviews and source folder with the cleaned
# reviews, both on the mounted Google Drive.
output_base_path = '/content/drive/MyDrive/Sentiment_Sifters/data/country_loc'
input_base_path = '/content/drive/MyDrive/Sentiment_Sifters/data/cleaned_loc'

# Run the enrichment for every supported file format
process_all_files(
    input_base_path=input_base_path,
    output_base_path=output_base_path,
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
No results found for Chico, CA USA
No results found for Every thing is bigger n better in Texas
No results found for Atlantic Beach, NC
No results found for Brazil
No results found for Los Angeles, CA
No results found for Nowhere worth mentioning
No results found for Clinton, MS USA
No results found for Inwood, New York
No results found for Chicago Suburbs, IL United States
No results found for Texas
No results found for Butler, PA
No results found for Boston, MA USA
No results found for Oceanside, CA, USA
No results found for Spokane, WA
No results found for USA
No results found for New Mexico
No results found for Montana
No results found for Reno, NV
No results found for Los Angeles, CA USA
No results found for Blossburg Pa.
No results found for Seattle, WA
No results found for Illinois USA
No results found for Michigan
No results found for San Diego
No results found for USA
No results found for Iowa USA
No results foun

In [None]:
import pandas as pd
import os

# Define the folder where your CSV files are located
folder_path = '/content/drive/MyDrive/Sentiment_Sifters/data/processed/csv'

# Get all CSV files in the folder, in sorted order so the combined frame is
# reproducible. A missing folder is reported instead of raising.
if os.path.isdir(folder_path):
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if len(csv_files) == 0:
        print("No CSV files found in the folder.")
else:
    print(f"Folder not found: {folder_path}")
    csv_files = []

# Start from an empty frame so `combined_df` is defined even when nothing is read
combined_df = pd.DataFrame()

# Read each file; a file that cannot be parsed is reported and skipped
dataframes = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(file_path)
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

if dataframes:
    # Concatenate all DataFrames into one
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Check the number of rows in the combined DataFrame
    row_count = combined_df.shape[0]

    # Display the number of rows
    print(f'Total number of rows in the combined DataFrame: {row_count}')
elif csv_files:
    print("No DataFrames to concatenate.")

# Show the columns of the combined frame. This used to inspect `df`, i.e. only the
# last file read, and raised NameError when no file had been read.
combined_df.columns

Total number of rows in the combined DataFrame: 63480


Index(['unique_id', 'asin', 'product_name', 'product_type', 'helpful',
       'rating', 'title', 'date', 'reviewer', 'reviewer_location',
       'review_text', 'country', 'region'],
      dtype='object')