In [1]:
import concurrent.futures
import json
import os

import openai
import pandas as pd
import spacy
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

In [3]:

# Read the article dataset (a JSON file) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run on another machine until it points at a local copy of the file.
dataset_path = "C:\\Users\\chiso\\Downloads\\demo-data-setup-documents\\reduced_file.json"
data = pd.read_json(dataset_path)

In [5]:

# Name of the spaCy pipeline used for named-entity recognition.
spacy_model_name = "en_core_web_md"
# Load the pipeline once at module level and keep it in `nlp`.
nlp = spacy.load(spacy_model_name)

# Function to extract locations using NER
def extract_locations(text):
    """Return the geopolitical entities (spaCy ``GPE``) mentioned in ``text``.

    Args:
        text: Article text. Anything that is not a non-blank string (``None``,
            ``NaN``, numbers, ``""``) is treated as "no text".

    Returns:
        list[str]: Entity texts in document order; duplicates are kept.
    """
    # Missing articles show up as None/NaN in a DataFrame column and would make
    # the spaCy pipeline raise, so short-circuit before calling it.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)  # Process text through spaCy's NLP pipeline
    # An entity carries exactly one label, so matching "GPE" already excludes
    # PERSON entities; no second check is needed.
    return [ent.text for ent in doc.ents if ent.label_ == "GPE"]
# Run the NER helper over every article and store the list of place names per row.
article_locations = data['article'].apply(extract_locations)
data['article_locations'] = article_locations

# Preview the first rows: each title next to the places found in its article.
preview_columns = ['title', 'article_locations']
data[preview_columns].head()

Unnamed: 0,title,article_locations
0,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
1,Nebraska officials again sound alarm on human ...,"[Nebraska, Nebraska, Nebraska, Omaha, Nebraska..."
2,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
3,Gang of five convicted of human trafficking an...,"[Glasgow, Romania, Scotland, Scotland, Scotland]"
4,How Texans Can Help Fight Human Trafficking Ac...,"[Dallas, Texas, Texas]"


In [6]:

# Create the Nominatim geocoder client; the settings are gathered in one place
# so the user agent and timeout are easy to spot and adjust.
geocoder_settings = {"user_agent": "myApp", "timeout": 10}
geolocator = Nominatim(**geocoder_settings)

from geopy.exc import GeocoderTimedOut

# Translation table from country names written in a local script to their
# English names. Names without an entry are meant to be used unchanged.
country_mapping = {
    'پاکستان': 'Pakistan',
    'الكويت': 'Kuwait',
    # 'روما' is the Arabic spelling of Rome, not Romania. It is kept so existing
    # lookups behave as before; the Arabic name of Romania is added next to it.
    'روما': 'Romania',
    'رومانيا': 'Romania',
    'ليبيا': 'Libya',
    'اليونان': 'Greece',
    # Add more translations here as necessary
}

def get_country_from_city(city, retries=3):
    """Look up the country a place belongs to via the Nominatim geocoder.

    Args:
        city: Free-text place name (city, region or country).
        retries: Maximum number of geocoding attempts when the service times out.

    Returns:
        The country name (translated through ``country_mapping`` when an entry
        exists), or ``None`` if ``city`` is blank or not a string, the geocoder
        finds no match, or every attempt timed out.
    """
    # Nothing to geocode: do not send an empty or non-text query to the service.
    if not isinstance(city, str) or not city.strip():
        return None

    for attempt in range(retries):
        try:
            # Ask for English names so the country does not come back in the
            # local script; country_mapping remains as a fallback translation.
            location = geolocator.geocode(city, language="en")
        except GeocoderTimedOut:
            print(f"Timeout while geocoding {city}, retrying {attempt+1}/{retries}...")
            continue

        if location is None:
            # "Not found" is a definitive answer; repeating the same query
            # would only add load on the geocoding service.
            return None

        # The formatted address is assumed to end with the country, so take the
        # last comma-separated component.
        country = location.address.split(",")[-1].strip()
        return country_mapping.get(country, country)  # Ensure English names
    return None  # Every attempt timed out


In [7]:
# Function to check if a location is valid (city/country) and return with country
def get_location_with_country(location):
    """Pair a place name with its country, or return ``None`` if it cannot be resolved.

    Args:
        location: Place name to validate through the geocoder.

    Returns:
        ``{"city": location, "country": <country>}`` when a country is found;
        ``None`` when ``location`` is blank or not a string, no country is
        found, or the lookup raises.
    """
    # Model output can contain nulls or nested values; only real text is geocoded.
    if not isinstance(location, str) or not location.strip():
        return None

    try:
        country = get_country_from_city(location)
    except Exception as exc:
        # Stay best-effort (one bad lookup must not abort a whole DataFrame
        # apply), but report the failure so a service error is not mistaken
        # for "not a real place".
        print(f"Error while geocoding {location}: {exc}")
        return None

    return {"city": location, "country": country} if country else None

In [8]:
def filter_geographical_locations(locations):
    """Keep only the classified locations that the geocoder can resolve.

    Args:
        locations: Mapping with optional ``"Source"``, ``"Transit"`` and
            ``"Destination"`` entries, each a list of place names. ``None``,
            empty or non-mapping values are treated as "no locations".

    Returns:
        dict: The same three keys, each holding a list of
        ``{"city": ..., "country": ...}`` dicts for the places that resolved.
    """
    filtered = {"Source": [], "Transit": [], "Destination": []}
    # The input is parsed model output, so it may be None, a list or a string
    # rather than the expected mapping.
    if not isinstance(locations, dict):
        return filtered

    for category in filtered:
        names = locations.get(category) or []  # tolerate a missing or null category
        if isinstance(names, str):
            # A bare string would otherwise be iterated character by character.
            names = [names]
        elif not isinstance(names, (list, tuple)):
            continue
        for loc in names:
            loc_with_country = get_location_with_country(loc)
            if loc_with_country:
                filtered[category].append(loc_with_country)
    return filtered

In [19]:
# Initialize the OpenAI API key from the environment so the secret is never
# stored in the notebook or in its saved outputs.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; OpenAI requests will fail.")


def _parse_traffic_flow_response(raw_response):
    """Turn the model's reply text into a Source/Transit/Destination dict of lists."""
    traffic_flow = {"Source": [], "Transit": [], "Destination": []}

    # The JSON object may be wrapped in a Markdown code fence (with or without
    # a "json" tag) or surrounded by prose, so parse only the span from the
    # first "{" to the last "}".
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        print("Warning: Failed to parse JSON, returning empty dictionary.")
        return traffic_flow

    try:
        parsed = json.loads(raw_response[start:end + 1])
    except json.JSONDecodeError:
        print("Warning: Failed to parse JSON, returning empty dictionary.")
        return traffic_flow

    # Keep only the three expected categories and make sure each one is a list.
    for category in traffic_flow:
        value = parsed.get(category)
        if isinstance(value, str):
            value = [value]
        if isinstance(value, list):
            traffic_flow[category] = value
    return traffic_flow


def extract_traffic_flow_chatgpt_with_context(article):
    """Ask the chat model to classify an article's locations by trafficking role.

    Args:
        article: Article text. Blank or non-string values are not sent to the API.

    Returns:
        dict: ``{"Source": [...], "Transit": [...], "Destination": [...]}``.
        All three lists are empty when the article is blank, the request
        fails, or the reply contains no parseable JSON object.
    """
    # Skip the (billed) API call when there is no text to classify.
    if not isinstance(article, str) or not article.strip():
        return {"Source": [], "Transit": [], "Destination": []}

    prompt = f"""
    Given the following article, classify locations into Source, Transit, and Destination categories:
    Article: {article}
    Locations Classification:
    - Source: The origin location where the trafficking began.
    - Transit: Locations where the victim was moved through.
    - Destination: The final location where the victim arrived.
    Please return the locations in the format:
    {{
        "Source": ["Location1", "Location2"],
        "Transit": ["Location3", "Location4"],
        "Destination": ["Location5"]
    }}
    """

    try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface;
        # confirm the installed openai version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{
                "role": "system", "content": "You are a helpful assistant that understands human trafficking flows."
            }, {
                "role": "user", "content": prompt
            }],
            max_tokens=150,
            temperature=0.5,
        )

        raw_response = response['choices'][0]['message']['content'].strip()
        print(f"Raw response: {raw_response}")  # Debugging

        return _parse_traffic_flow_response(raw_response)

    except Exception as e:
        # Best-effort: one failed request must not abort a DataFrame-wide apply.
        print(f"Error: {e}")
        return {"Source": [], "Transit": [], "Destination": []}


# Sample dataset (replace with actual dataset)
sample_data = pd.DataFrame({
    "title": ["Trafficking Case 1", "Trafficking Case 2", "Trafficking Case 3"],
    "article": [
        "The victim was moved from Paris to Berlin before arriving in Madrid.",
        "Authorities rescued victims who traveled from Johannesburg to Cape Town.",
        "The Federal Investigation Agency (FIA) Gujranwala Zone on Friday apprehended five individuals involved in human trafficking and visa fraud during raids in Gujranwala and Gujrat."
    ]
})

# Apply the ChatGPT-based traffic flow prediction with context
sample_data['traffic_flow'] = sample_data['article'].apply(extract_traffic_flow_chatgpt_with_context)

# Apply the filtering function to the extracted traffic flow.
# filter_geographical_locations already maps a missing flow to empty
# Source/Transit/Destination lists, so no separate fallback with differently
# named keys is needed here.
sample_data['filtered_traffic_flow'] = sample_data['traffic_flow'].apply(filter_geographical_locations)

# Display the results
pd.set_option('display.max_colwidth', None)  # Prevent truncation
print(sample_data[['title', 'traffic_flow', 'filtered_traffic_flow']].head())


Raw response: {
        "Source": ["Paris"],
        "Transit": ["Berlin"],
        "Destination": ["Madrid"]
    }
Raw response: {
    "Source": ["Johannesburg"],
    "Transit": [],
    "Destination": ["Cape Town"]
}
Raw response: {
    "Source": ["Gujranwala", "Gujrat"],
    "Transit": [],
    "Destination": []
}
                title  \
0  Trafficking Case 1   
1  Trafficking Case 2   
2  Trafficking Case 3   

                                                                traffic_flow  \
0    {'Source': ['Paris'], 'Transit': ['Berlin'], 'Destination': ['Madrid']}   
1  {'Source': ['Johannesburg'], 'Transit': [], 'Destination': ['Cape Town']}   
2     {'Source': ['Gujranwala', 'Gujrat'], 'Transit': [], 'Destination': []}   

                                                                                                                                                       filtered_traffic_flow  
0  {'Source': [{'city': 'Paris', 'country': 'France'}], 'Transit': [{'city': 'Berlin',

In [21]:
# Apply the ChatGPT-based traffic flow prediction with context
data['traffic_flow'] = data['article'].apply(extract_traffic_flow_chatgpt_with_context)

# Apply the filtering function to the extracted traffic flow.
# filter_geographical_locations already maps a missing flow to empty
# Source/Transit/Destination lists, so no separate fallback with differently
# named keys is needed here.
data['filtered_traffic_flow'] = data['traffic_flow'].apply(filter_geographical_locations)

# Display the results
pd.set_option('display.max_colwidth', None)  # Prevent truncation
print(data[['title', 'traffic_flow', 'filtered_traffic_flow']].head())

Raw response: {
    "Source": ["Romania"],
    "Transit": [],
    "Destination": ["Scotland", "Dundee"]
}
Raw response: {
    "Source": ["Nebraska"],
    "Transit": [],
    "Destination": []
}
Raw response: {
    "Source": ["Romania"],
    "Transit": [],
    "Destination": ["Scotland", "Dundee"]
}
Raw response: {
    "Source": ["Romania"],
    "Transit": [],
    "Destination": ["Scotland", "Dundee"]
}
Raw response: {
    "Source": ["Texas"],
    "Transit": [],
    "Destination": ["Dallas"]
}
Raw response: {
    "Source": ["Ethiopia"],
    "Transit": [],
    "Destination": ["South Africa", "Johannesburg"]
}
Raw response: {
    "Source": ["Ethiopia"],
    "Transit": [],
    "Destination": ["Sandringham, Johannesburg"]
}
Raw response: {
    "Source": [],
    "Transit": ["Omaha", "Nebraska"],
    "Destination": ["Capitol"]
}
Raw response: {
    "Source": ["Romania"],
    "Transit": ["Tayside area", "Dundee"],
    "Destination": ["Scotland"]
}
Raw response: {
    "Source": ["Romania"],
    