In [1]:
import concurrent.futures
import json
import os

import openai
import pandas as pd
import spacy
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

In [2]:

# Read the article dump (JSON) into a DataFrame.
# The path below is an absolute location on the author's machine.
dataset_path = "C:\\Users\\chiso\\Downloads\\demo-data-setup-documents\\reduced_file.json"
data = pd.read_json(dataset_path)

In [3]:

# Load the spaCy NLP model once; `nlp` is the pipeline object that
# extract_locations() calls on every article text.
nlp = spacy.load("en_core_web_md")

# Function to extract locations using NER
def extract_locations(text):
    """Return the geopolitical entities (GPE) that spaCy finds in *text*.

    Args:
        text: Article text. Anything that is not a non-blank string (``None``,
            ``NaN`` from a missing DataFrame cell, ...) yields an empty list
            instead of being passed to the spaCy pipeline.

    Returns:
        list[str]: Texts of the entities labelled ``GPE``, in document order,
        duplicates kept.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)  # Process text through spaCy's NLP pipeline
    # An entity carries exactly one label, so testing for "GPE" already
    # excludes PERSON (and every other) entities.
    return [ent.text for ent in doc.ents if ent.label_ == "GPE"]

# Function to process locations in parallel
def extract_locations_parallel(df, column_name):
    """Run extract_locations over one column of *df* using a thread pool.

    Args:
        df: Object indexable by column name (a DataFrame in this notebook).
        column_name: Name of the column holding the texts.

    Returns:
        list: One extract_locations result per entry, in input order.
    """
    texts = df[column_name]
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Executor.map yields results in the same order as the inputs
        per_row_locations = [found for found in pool.map(extract_locations, texts)]
    return per_row_locations

# Run the NER extraction over every article and store the result per row
article_texts = data['article']
data['article_locations'] = article_texts.apply(extract_locations)

# Preview the titles next to the locations found in each article
preview_columns = ['title', 'article_locations']
data[preview_columns].head()

Unnamed: 0,title,article_locations
0,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
1,Nebraska officials again sound alarm on human ...,"[Nebraska, Nebraska, Nebraska, Omaha, Nebraska..."
2,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
3,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
4,How Texans Can Help Fight Human Trafficking Ac...,"[Dallas, Texas, Texas]"


In [4]:

# Initialize GeoPy geocoder; this single instance is used by get_country_from_city
geolocator = Nominatim(user_agent="myApp", timeout=10)

# Exception type that get_country_from_city catches in order to retry a lookup
from geopy.exc import GeocoderTimedOut

# Mapping countries to English: get_country_from_city looks up the last
# comma-separated component of a geocoded address here; names without an
# entry are returned unchanged.
country_mapping = {
    'پاکستان': 'Pakistan',
    'دولة الكويت': 'Kuwait',
    # NOTE(review): the next key looks like the Arabic name for Rome rather than Romania — confirm
    'روما': 'Romania',
    'ليبيا': 'Libya',
    'اليونان': 'Greece',
    # Add more translations here as necessary
}

def get_country_from_city(city, max_retries=3):
    """Geocode *city* and return the country part of the resolved address.

    The country is taken as the last comma-separated component of the
    geocoder's address string and translated through ``country_mapping``
    when an entry exists.

    Args:
        city: Place name to geocode.
        max_retries: How many times a timed-out request is retried before
            giving up. The original implementation retried by unbounded
            recursion, which could end in ``RecursionError`` during an outage.

    Returns:
        str | None: Country name, or ``None`` when the place cannot be
        resolved, the lookup keeps timing out, or any other error occurs.
    """
    # NOTE(review): geopy's geocode() accepts a `language` argument; requesting
    # English names there may make country_mapping unnecessary — confirm.
    retries_allowed = max(int(max_retries), 0)
    for attempt in range(retries_allowed + 1):
        try:
            location = geolocator.geocode(city)
            if not location:
                return None
            country = location.address.split(",")[-1].strip()
            # Default to the geocoder's own spelling if no mapping exists
            return country_mapping.get(country, country)
        except GeocoderTimedOut:
            if attempt == retries_allowed:
                print(f"Timeout error occurred while geocoding {city}. Giving up after {retries_allowed} retries.")
                return None
            print(f"Timeout error occurred while geocoding {city}. Retrying...")
        except Exception as e:
            print(f"Error resolving location {city}: {e}")
            return None
    return None


In [6]:
# Function to check if a location is valid (city/country) and return with country
def get_location_with_country(location):
    """Pair *location* with the country that get_country_from_city resolves for it.

    Returns:
        dict | None: ``{"city": location, "country": <country>}`` when a
        country was found; ``None`` when none was found or the lookup raised.
    """
    try:
        resolved_country = get_country_from_city(location)
        return {"city": location, "country": resolved_country} if resolved_country else None
    except Exception:
        return None

In [7]:
# Function to filter out non-geographical locations (cities and countries only)
def filter_geographical_locations(locations):
    """Keep only the locations that can be resolved to a country.

    Args:
        locations: Traffic-flow mapping with optional ``'start'``,
            ``'intermediate'`` and ``'end'`` entries. The value comes from a
            language model, so malformed shapes are tolerated: anything that
            is not a dict is treated as "no locations", a missing or ``null``
            category is treated as empty, and a bare string is treated as a
            one-element list (instead of being iterated character by character).

    Returns:
        dict: ``{"start": [...], "intermediate": [...], "end": [...]}`` where
        each list holds the truthy results of get_location_with_country.
    """
    filtered_locations = {"start": [], "intermediate": [], "end": []}
    if not isinstance(locations, dict):
        return filtered_locations

    for category, resolved in filtered_locations.items():
        candidates = locations.get(category)
        if isinstance(candidates, str):
            candidates = [candidates]
        elif not isinstance(candidates, (list, tuple)):
            # Missing key, null, or an unexpected scalar: nothing to geocode
            continue

        for location in candidates:
            if not location:
                continue  # skip null/empty entries without a geocoder call
            location_with_country = get_location_with_country(location)
            if location_with_country:
                resolved.append(location_with_country)

    return filtered_locations


In [None]:
# Initialize OpenAI API
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential (or placeholder to be hand-edited) is stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Refined function to extract traffic flow using ChatGPT with better contextual understanding
def extract_traffic_flow_chatgpt_with_context(article):
    """Ask the chat model to classify an article's locations into a trafficking route.

    Args:
        article: Article text. Missing or blank values return ``None`` without
            calling the API.

    Returns:
        dict | None: The JSON object parsed from the model's reply (expected
        keys ``'start'``, ``'intermediate'``, ``'end'``; the model may omit
        some), or ``None`` when the reply contains no parseable JSON object.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; API errors are not
        caught here.
    """
    # Nothing to classify: avoid a paid API call on missing/blank cells
    if not isinstance(article, str) or not article.strip():
        return None

    prompt = f"""
    Given the following article, classify the locations into start, intermediate, and end locations. Distinguish between locations where victims were actually trafficked and locations that were only promised. Exclude non-geographical locations, such as organizations, departments, or unspecified places.

    Article: "{article}"

    Please answer the following:
    1. Classify actual trafficking flow into 'start', 'intermediate', and 'end' locations.
    2. If a location is mentioned as a promised destination but victims were not actually trafficked to it, exclude it from 'intermediate' and 'end' locations.
    3. Return the output in JSON format with 'start', 'intermediate', and 'end' locations, like this:
    {{
        "start": ["City1"],
        "intermediate": ["City2", "City3"],
        "end": ["City4"]
    }}
    """

    # Send the prompt to OpenAI API (chat model)
    # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface —
    # confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that understands human trafficking flows and can distinguish between actual trafficking routes and promised destinations."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.5,
    )

    # Print the raw response for debugging
    raw_response = response['choices'][0]['message']['content'].strip()
    print(f"Raw response: {raw_response}")

    # Replies are sometimes wrapped in ``` fences or surrounded by commentary;
    # parse only the span from the first "{" to the last "}" when there is one.
    first_brace = raw_response.find("{")
    last_brace = raw_response.rfind("}")
    if 0 <= first_brace < last_brace:
        json_text = raw_response[first_brace:last_brace + 1]
    else:
        json_text = raw_response

    try:
        traffic_flow = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing response: {e}")
        return None

    # Downstream code calls .get() on the result, so only a JSON object is usable
    if not isinstance(traffic_flow, dict):
        print(f"Error parsing response: expected a JSON object, got {type(traffic_flow).__name__}")
        return None
    return traffic_flow

In [10]:

# Sample data (replace with actual dataset)
sample_titles = ["Trafficking Case 1", "Trafficking Case 2", "Trafficking Case 3"]
sample_articles = [
    "The victim was moved from Paris to Berlin before arriving in Madrid.",
    "Authorities rescued victims who traveled from Johannesburg to Cape Town.",
    "The Federal Investigation Agency (FIA) Gujranwala Zone on Friday apprehended five individuals involved in human trafficking and visa fraud during raids in Gujranwala and Gujrat.",
]
sample_data = pd.DataFrame({"title": sample_titles, "article": sample_articles})

# Classify each article's locations with the chat model (one API call per row)
article_column = sample_data['article']
sample_data['traffic_flow'] = article_column.apply(extract_traffic_flow_chatgpt_with_context)

# Geocode the classified locations and keep those that resolve to a country
flow_column = sample_data['traffic_flow']
sample_data['filtered_traffic_flow'] = flow_column.apply(filter_geographical_locations)

# Show the raw and filtered flows side by side
pd.set_option('display.max_colwidth', None)  # Prevent truncation
result_columns = ['title', 'traffic_flow', 'filtered_traffic_flow']
print(sample_data[result_columns].head())


Raw response: {
    "start": ["Paris"],
    "intermediate": ["Berlin"],
    "end": ["Madrid"]
}
Raw response: {
    "start": ["Johannesburg"],
    "end": ["Cape Town"]
}
Raw response: {
    "start": ["Gujranwala"],
    "intermediate": [],
    "end": ["Gujrat"]
}
                title  \
0  Trafficking Case 1   
1  Trafficking Case 2   
2  Trafficking Case 3   

                                                          traffic_flow  \
0  {'start': ['Paris'], 'intermediate': ['Berlin'], 'end': ['Madrid']}   
1                    {'start': ['Johannesburg'], 'end': ['Cape Town']}   
2     {'start': ['Gujranwala'], 'intermediate': [], 'end': ['Gujrat']}   

                                                                                                                                                   filtered_traffic_flow  
0  {'start': [{'city': 'Paris', 'country': 'France'}], 'intermediate': [{'city': 'Berlin', 'country': 'Deutschland'}], 'end': [{'city': 'Madrid', 'country': 'España'}]}

In [None]:
# Apply the ChatGPT-based traffic flow prediction to the full dataset
# (extract_traffic_flow_chatgpt_with_context is the extractor defined in this notebook)
data['traffic_flow'] = data['article'].apply(extract_traffic_flow_chatgpt_with_context)

# Apply the filtering function to the extracted traffic flow
data['filtered_traffic_flow'] = data['traffic_flow'].apply(filter_geographical_locations)

# Display the results
pd.set_option('display.max_colwidth', None)  # Prevent truncation
print(data[['title', 'filtered_traffic_flow']].head())


In [None]:
import openai
import json
import pandas as pd
from geopy.geocoders import Nominatim

# OpenAI API key: read from the OPENAI_API_KEY environment variable so that no
# credential (or placeholder to be hand-edited) is stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Initialize GeoPy geocoder
geolocator = Nominatim(user_agent="myApp", timeout=10)

# Function to get the country from a city using GeoPy
def get_country_from_city(city):
    """Geocode *city* and return the last comma-separated part of its address.

    Note: this redefines the function of the same name from an earlier cell;
    this version applies no country-name mapping and has no timeout retry.

    Returns:
        str | None: The stripped trailing address component, or ``None`` when
        the geocoder finds nothing or any exception is raised (the error is
        printed).
    """
    try:
        match = geolocator.geocode(city)
        if not match:
            return None
        # Everything after the final comma (the whole address if there is none)
        trailing_part = match.address.rpartition(",")[2]
        return trailing_part.strip()
    except Exception as e:
        print(f"Error resolving location {city}: {e}")
        return None

# Function to get traffic flow using ChatGPT
def extract_traffic_flow_chatgpt_with_country(article):
    """Ask the chat model for start/intermediate/end locations with their countries.

    Args:
        article: Article text. Missing or blank values return ``None`` without
            calling the API.

    Returns:
        dict | None: The JSON object parsed from the model's reply, or ``None``
        when the reply contains no parseable JSON object.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; API errors are not
        caught here.
    """
    # Nothing to classify: avoid a paid API call on missing/blank cells
    if not isinstance(article, str) or not article.strip():
        return None

    # Every literal brace in the example must be doubled: inside an f-string a
    # single "{...}" is a replacement field, and the unescaped example objects
    # made the original prompt raise ValueError (invalid format specifier).
    prompt = f"""
    Given the following article, classify the locations into start, intermediate, and end locations.
    Only include valid geographical locations such as cities and countries. Exclude any non-geographical terms or specific addresses.
    Ensure the output includes both the city names and their corresponding countries.

    Article: "{article}"

    Example output format:
    {{
        "start": [{{"city": "City1", "country": "Country1"}}],
        "intermediate": [{{"city": "City2", "country": "Country2"}}],
        "end": [{{"city": "City3", "country": "Country3"}}]
    }}
    """

    # Send the prompt to OpenAI API (chat model)
    # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface —
    # confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.5,
    )

    # Get and clean the raw response
    raw_response = response['choices'][0]['message']['content'].strip()

    # Replies are sometimes wrapped in ``` fences or surrounded by commentary;
    # parse only the span from the first "{" to the last "}" when there is one.
    first_brace = raw_response.find("{")
    last_brace = raw_response.rfind("}")
    if 0 <= first_brace < last_brace:
        json_text = raw_response[first_brace:last_brace + 1]
    else:
        json_text = raw_response

    try:
        traffic_flow = json.loads(json_text)
    except json.JSONDecodeError:
        print(f"Error parsing response: {raw_response}")
        return None

    # Only a JSON object carries the start/intermediate/end structure
    if not isinstance(traffic_flow, dict):
        print(f"Error parsing response: {raw_response}")
        return None
    return traffic_flow

# Sample dataset
sample_titles = ["Trafficking Case 1", "Trafficking Case 2", "Trafficking Case 3"]
sample_articles = [
    "The victim was moved from Paris to Berlin before arriving in Madrid.",
    "Authorities rescued victims who traveled from Johannesburg to Cape Town.",
    "The Federal Investigation Agency (FIA) Gujranwala Zone on Friday apprehended five individuals involved in human trafficking and visa fraud during raids in Gujranwala and Gujrat.",
]
sample_data = pd.DataFrame({"title": sample_titles, "article": sample_articles})

# Classify each article's locations (city + country) with the chat model
article_column = sample_data['article']
sample_data['traffic_flow'] = article_column.apply(extract_traffic_flow_chatgpt_with_country)

# Apply the filtering function to the extracted traffic flow
# NOTE(review): filter_geographical_locations_with_country is not defined in the
# visible part of this notebook — confirm where it comes from before running.
flow_column = sample_data['traffic_flow']
sample_data['filtered_traffic_flow'] = flow_column.apply(filter_geographical_locations_with_country)

# Display the results
result_columns = ['title', 'traffic_flow', 'filtered_traffic_flow']
print(sample_data[result_columns].head())
