In [None]:
from collections import OrderedDict

# Collect the `user` value of every anonymous row, split by its `weaponised`
# label. (For anonymous rows `user` is presumably the editor's IP address --
# TODO confirm against the dataframe's schema.)
#
# Boolean masks replace the former row-by-row `iterrows()` loop: same rows,
# same order, but evaluated column-wise by pandas.
anon_mask = df['is_anon'].eq(True)

weap_ip_adresses = df.loc[anon_mask & df['weaponised'].eq('Weaponised'), 'user'].tolist()
non_weap_ip_adresses = df.loc[anon_mask & df['weaponised'].eq('Not Weaponised'), 'user'].tolist()

# De-duplicate while keeping first-seen order.
unique_weap_ip_adresses = list(OrderedDict.fromkeys(weap_ip_adresses))
unique_non_weap_ip_adresses = list(OrderedDict.fromkeys(non_weap_ip_adresses))

In [4]:
#!/usr/bin/env python3
"""
ip_locations_average_with_proxy_map.py

- Use ip-api.com batch endpoint to geolocate IP addresses (max 100 per request)
- Add proxy info (True/False)
- Compute spherical geographic centroid
- Save results to CSV
- Generate interactive Folium map of all IP locations

Requirements:
    pip install requests pandas folium
"""

import math
import folium
from typing import List, Dict, Any
import requests
import time
import pandas as pd


# Batch geolocation endpoint (free tier, HTTP only).
BATCH_URL = "http://ip-api.com/batch"

# Response fields requested for every IP; `proxy` adds the proxy flag.
FIELDS = "status,query,lat,lon,city,country,proxy,message"

# Headers sent with every batch request.
HEADERS = dict([
    ("User-Agent", "DH_Project/1.0 (https://www.epfl.ch/labs/dhlab/; maxime.garambois@epfl.ch)"),
    ("Content-Type", "application/json"),
])

def chunked(iterable, n):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; the final slice may be
    shorter than ``n``.
    """
    total = len(iterable)
    starts = range(0, total, n)
    yield from (iterable[start : start + n] for start in starts)

def query_ip_batch(ip_list: List[str]) -> List[Dict[str, Any]]:
    """POST one batch of IP addresses to the batch endpoint and return its JSON.

    Args:
        ip_list: IP addresses, sent as the JSON body of a single request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    resp = requests.post(
        BATCH_URL,
        json=ip_list,
        params={"fields": FIELDS},
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()

    # Respect rate limits if provided. Only the header parsing is guarded:
    # a malformed value disables the (best-effort) throttling, while any
    # other error is no longer silently swallowed.
    remaining_raw = resp.headers.get("X-Rl")
    reset_raw = resp.headers.get("X-Ttl")
    try:
        remaining = int(remaining_raw) if remaining_raw is not None else None
        reset_in = int(reset_raw) if reset_raw is not None else None
    except ValueError:
        remaining = reset_in = None

    if remaining is not None and remaining <= 0 and reset_in:
        print(f"[rate-limit] X-Rl=0, sleeping {reset_in}s")
        # Clamp so a negative header value can never make time.sleep() raise.
        time.sleep(max(reset_in + 1, 0))
    return resp.json()

def geographic_centroid(coords: List[Dict[str, float]]) -> Dict[str, float]:
    """Return the spherical centroid of a list of latitude/longitude points.

    Each point (degrees) is mapped to a unit vector; the vectors are averaged
    and the mean vector is converted back to degrees.

    Args:
        coords: Dicts with ``"lat"`` and ``"lon"`` keys. Entries where either
            value is missing or ``None`` are ignored.

    Returns:
        ``{"lat": ..., "lon": ...}`` in degrees, or both ``None`` when no
        usable point was supplied.
    """
    if not coords:
        return {"lat": None, "lon": None}

    totals = [0.0, 0.0, 0.0]  # running x, y, z sums of the unit vectors
    used = 0
    for point in coords:
        lat_deg = point.get("lat")
        lon_deg = point.get("lon")
        if lat_deg is None or lon_deg is None:
            continue
        phi = math.radians(lat_deg)
        lam = math.radians(lon_deg)
        totals[0] += math.cos(phi) * math.cos(lam)
        totals[1] += math.cos(phi) * math.sin(lam)
        totals[2] += math.sin(phi)
        used += 1

    if used == 0:
        return {"lat": None, "lon": None}

    mean_x = totals[0] / used
    mean_y = totals[1] / used
    mean_z = totals[2] / used
    horizontal = math.sqrt(mean_x * mean_x + mean_y * mean_y)
    centroid_lat = math.degrees(math.atan2(mean_z, horizontal))
    centroid_lon = math.degrees(math.atan2(mean_y, mean_x))
    return {"lat": centroid_lat, "lon": centroid_lon}

def main(
    ip_addresses: List[str],
    output_csv: str = "../datas/interim/IP Geolocalisation/cluster_11.csv",
    output_map: str = "../plots/IP Geolocalisation/cluster_11_locations.html",
):
    """Geolocate IPs, save the raw results, compute their centroid and map them.

    Args:
        ip_addresses: IP addresses to look up. Duplicates are dropped while
            keeping first-seen order.
        output_csv: Path of the CSV file receiving the raw API results.
        output_map: Path of the HTML file receiving the Folium map. It is
            only written when at least one IP was geolocated.

    Returns:
        A ``(df, centroid)`` tuple: the DataFrame of all API results and the
        centroid dict (``lat``/``lon`` are ``None`` when nothing was located).

    Raises:
        requests.HTTPError: re-raised when a batch request fails.
    """
    unique_ips = list(dict.fromkeys(ip_addresses))
    print(f"Total unique IPs to query: {len(unique_ips)}")

    all_results = []
    for batch_idx, batch in enumerate(chunked(unique_ips, 100), start=1):
        print(f"Querying batch {batch_idx}: {len(batch)} IPs")
        try:
            results = query_ip_batch(batch)
        except requests.HTTPError as e:
            print(f"HTTP error for batch {batch_idx}: {e}")
            raise
        all_results.extend(results)
        # Fixed pause between batches -- presumably to stay under the API's
        # request-rate limit (TODO confirm the exact quota).
        time.sleep(4.2)

    df = pd.DataFrame(all_results)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"‚úÖ Saved raw results to {output_csv}")

    # Empty input or an all-failure response produces a frame without the
    # status/lat/lon columns; treat that as "nothing located" instead of
    # failing with a KeyError.
    if {"status", "lat", "lon"}.issubset(df.columns):
        located_df = df[df["status"] == "success"].dropna(subset=["lat", "lon"])
    else:
        located_df = df.iloc[0:0]

    if located_df.empty:
        coords = []
    else:
        coords = located_df[["lat", "lon"]].to_dict(orient="records")
    centroid = geographic_centroid(coords)
    print(f"üåç Geographic centroid: {centroid}")

    # --- Generate Folium Map ---
    print("üó∫Ô∏è  Generating map...")
    if coords:
        m = folium.Map(location=[centroid["lat"], centroid["lon"]], zoom_start=3, tiles="cartodb positron")

        # Only rows with usable coordinates are drawn (the same rows that fed
        # the centroid), so a missing lat/lon can never reach folium.
        for _, row in located_df.iterrows():
            lat, lon = row["lat"], row["lon"]
            proxy = row.get("proxy", False)
            color = "red" if proxy else "blue"
            tooltip = f"IP: {row['query']}<br>{row['city']}, {row['country']}<br>Proxy: {proxy}"
            folium.CircleMarker(
                location=[lat, lon],
                radius=5,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=0.7,
                tooltip=tooltip
            ).add_to(m)

        # Add centroid marker
        folium.Marker(
            location=[centroid["lat"], centroid["lon"]],
            icon=folium.Icon(color="green", icon="crosshairs"),
            tooltip="Geographic Centroid"
        ).add_to(m)

        m.save(output_map)
        print("‚úÖ Saved interactive map")

    else:
        print("‚ö†Ô∏è No successful geolocations to map.")

    return df, centroid

# --- Example usage ---
if __name__ == "__main__":
    # NOTE(review): inside a notebook kernel this guard is true, so the block
    # runs whenever the cell is executed. It rebinds the global `df`, which the
    # first cell of this notebook reads -- re-running that cell afterwards will
    # see the geolocation results instead of the original frame.

    # IP addresses looked up in this example run.
    res = [
        '130.83.244.129',
        '141.154.121.149',
        '141.154.252.174',
        '141.154.30.120',
        '141.154.47.157',
        '151.203.109.139',
        '151.203.119.117',
        '151.203.123.189',
        '151.203.20.54',
        '151.203.251.199',
        '151.203.253.194',
        '151.203.39.9',
        '151.203.45.81',
        '151.203.50.144',
        '192.139.27.18',
        '194.150.174.4',
        '68.160.152.109',
        '68.160.181.69',
        '81.213.0.74',
        '81.213.2.7',
        '83.237.241.50',
        '85.96.11.162',
    ]
    df, centroid = main(res)

Total unique IPs to query: 22
Querying batch 1: 22 IPs
‚úÖ Saved raw results to ../datas/interim/IP Geolocalisation/cluster_11.csv
üåç Geographic centroid: {'lat': 53.66946509700055, 'lon': -53.24380888480387}
üó∫Ô∏è  Generating map...
‚úÖ Saved interactive map as ip_locations_map.html


In [None]:
#!/usr/bin/env python3
"""
ip_locations_two_groups_map.py

- Query two IP lists (control + test)
- Use ip-api.com batch endpoint (max 100 IPs/request)
- Compute geographic centroid per group
- Plot both on the same Folium map
"""

import math
import time
from typing import List, Dict, Any

import folium
import pandas as pd
import requests

# Response fields requested for every IP (includes the proxy flag).
FIELDS = "status,query,lat,lon,city,country,proxy,message"

# Batch geolocation endpoint.
BATCH_URL = "http://ip-api.com/batch"

# Headers sent with every batch request.
HEADERS = dict(
    [
        ("User-Agent", "DH_Project/1.0 (maxime.garambois@epfl.ch)"),
        ("Content-Type", "application/json"),
    ]
)

def chunked(iterable, n):
    """Split a sliceable sequence into consecutive pieces of at most ``n`` items.

    The sequence must support ``len()`` and slicing; the last piece may be
    shorter than ``n``.
    """
    for start in range(0, len(iterable), n):
        stop = start + n
        piece = iterable[start:stop]
        yield piece

def query_ip_batch(ip_list: List[str]) -> List[Dict[str, Any]]:
    """POST one batch of IP addresses to the batch endpoint and return its JSON.

    Args:
        ip_list: IP addresses, sent as the JSON body of a single request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    resp = requests.post(
        BATCH_URL,
        json=ip_list,
        params={"fields": FIELDS},
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()

    # Best-effort rate-limit handling. Only the header parsing is guarded: a
    # malformed value simply disables the throttling, while any other error
    # is no longer silently swallowed.
    remaining_raw = resp.headers.get("X-Rl")
    reset_raw = resp.headers.get("X-Ttl")
    try:
        remaining = int(remaining_raw) if remaining_raw is not None else None
        reset_in = int(reset_raw) if reset_raw is not None else None
    except ValueError:
        remaining = reset_in = None

    if remaining is not None and remaining <= 0 and reset_in:
        print(f"[rate-limit] X-Rl=0, sleeping {reset_in}s")
        # Clamp so a negative header value can never make time.sleep() raise.
        time.sleep(max(reset_in + 1, 0))
    return resp.json()

def geographic_centroid(coords: List[Dict[str, float]]) -> Dict[str, float]:
    """Compute the centroid of latitude/longitude points on the unit sphere.

    Points (in degrees) are converted to 3-D unit vectors, the vectors are
    averaged, and the mean vector is converted back to degrees.

    Args:
        coords: Dicts with ``"lat"`` and ``"lon"`` keys; entries where either
            value is missing or ``None`` are skipped.

    Returns:
        ``{"lat": ..., "lon": ...}`` in degrees, or both ``None`` when there
        is no usable point.
    """
    if not coords:
        return {"lat": None, "lon": None}

    sum_x, sum_y, sum_z = 0.0, 0.0, 0.0
    n_used = 0
    for entry in coords:
        lat_deg, lon_deg = entry.get("lat"), entry.get("lon")
        if lat_deg is not None and lon_deg is not None:
            phi, lam = math.radians(lat_deg), math.radians(lon_deg)
            cos_phi = math.cos(phi)
            sum_x += cos_phi * math.cos(lam)
            sum_y += cos_phi * math.sin(lam)
            sum_z += math.sin(phi)
            n_used += 1

    if not n_used:
        return {"lat": None, "lon": None}

    mean_x, mean_y, mean_z = sum_x / n_used, sum_y / n_used, sum_z / n_used
    planar_norm = math.sqrt(mean_x * mean_x + mean_y * mean_y)
    return {
        "lat": math.degrees(math.atan2(mean_z, planar_norm)),
        "lon": math.degrees(math.atan2(mean_y, mean_x)),
    }

def query_ip_group(ip_list: List[str], group_name: str) -> pd.DataFrame:
    """Geolocate every unique IP of one group and tag each result with the group.

    Args:
        ip_list: IP addresses of the group; duplicates are dropped while
            keeping first-seen order.
        group_name: Label stored under the ``"group"`` key of every result
            and used as a prefix in progress messages.

    Returns:
        A DataFrame built from all batch results of the group.

    Raises:
        requests.HTTPError: re-raised when a batch request fails.
    """
    deduped_ips = list(dict.fromkeys(ip_list))
    print(f"[{group_name}] Total unique IPs: {len(deduped_ips)}")

    records = []
    batch_no = 0
    for ip_batch in chunked(deduped_ips, 100):
        batch_no += 1
        print(f"[{group_name}] Querying batch {batch_no} ({len(ip_batch)} IPs)")
        try:
            batch_records = query_ip_batch(ip_batch)
        except requests.HTTPError as err:
            print(f"[{group_name}] HTTP error for batch {batch_no}: {err}")
            raise
        # Tag every record with its group before collecting it.
        for record in batch_records:
            record["group"] = group_name
        records += batch_records
        # Fixed pause after every batch.
        time.sleep(4.2)

    return pd.DataFrame(records)

def main(
    control_ips: List[str],
    test_ips: List[str],
    output_csv: str = "ip_api_results_two_groups.csv",
    output_map: str = "../plots/All Users Analysis/ip_locations_two_groups_map.html",
):
    """Geolocate a control and a test IP list and plot both on a single map.

    Args:
        control_ips: IP addresses of the control group.
        test_ips: IP addresses of the test group.
        output_csv: Path of the CSV file receiving the combined API results.
        output_map: Path of the HTML file receiving the Folium map. It is
            only written when at least one group has a located IP.

    Returns:
        A ``(df_all, centroids)`` tuple: the combined results DataFrame and a
        dict mapping ``"control"`` / ``"test"`` to their centroid dict
        (``lat``/``lon`` are ``None`` for a group without located IPs).

    Raises:
        requests.HTTPError: re-raised when a batch request fails.
    """
    df_control = query_ip_group(control_ips, "control")
    df_test = query_ip_group(test_ips, "test")

    df_all = pd.concat([df_control, df_test], ignore_index=True)
    df_all.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"‚úÖ Saved combined results to {output_csv}")

    # Empty input or an all-failure response produces a frame without the
    # columns used below; treat that as "nothing located" instead of failing
    # with a KeyError.
    if {"status", "group", "lat", "lon"}.issubset(df_all.columns):
        located_df = df_all[df_all["status"] == "success"].dropna(subset=["lat", "lon"])
    else:
        located_df = df_all.iloc[0:0]

    # --- Compute centroids per group ---
    centroids = {}
    for group in ["control", "test"]:
        if located_df.empty:
            coords = []
        else:
            coords = located_df.loc[located_df["group"] == group, ["lat", "lon"]].to_dict(orient="records")
        centroids[group] = geographic_centroid(coords)
        print(f"üåç {group.capitalize()} centroid: {centroids[group]}")

    # A group without any located IP has a None/None centroid, which cannot
    # be used as a map location; keep only the drawable ones.
    drawable = {
        group: centroid
        for group, centroid in centroids.items()
        if centroid["lat"] is not None and centroid["lon"] is not None
    }

    # --- Generate Folium Map ---
    if drawable:
        # Centre on the first group that has a centroid (control when available).
        center = next(iter(drawable.values()))
        m = folium.Map(
            location=[center["lat"], center["lon"]],
            zoom_start=3,
            tiles="cartodb positron"
        )

        color_map = {"control": "blue", "test": "red"}

        for _, row in located_df.iterrows():
            lat, lon = row["lat"], row["lon"]
            group = row["group"]
            color = color_map.get(group, "gray")
            tooltip = f"[{group}] IP: {row['query']}<br>{row['city']}, {row['country']}<br>Proxy: {row.get('proxy', False)}"
            folium.CircleMarker(
                location=[lat, lon],
                radius=5,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=0.7,
                tooltip=tooltip
            ).add_to(m)

        # Add centroid markers
        for group, centroid in drawable.items():
            folium.Marker(
                location=[centroid["lat"], centroid["lon"]],
                icon=folium.Icon(color="green" if group == "control" else "purple", icon="crosshairs"),
                tooltip=f"{group.capitalize()} centroid"
            ).add_to(m)

        m.save(output_map)
        # Report the file name only, as before.
        map_name = output_map.rsplit("/", 1)[-1]
        print(f"‚úÖ Saved map: {map_name}")
    else:
        print("‚ö†Ô∏è No successful geolocations to map.")

    return df_all, centroids


# Control group: anonymous users labelled 'Not Weaponised';
# test group: anonymous users labelled 'Weaponised'.
df_all, centroids = main(
    control_ips=unique_non_weap_ip_adresses,
    test_ips=unique_weap_ip_adresses,
)