In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests
import zipfile
from io import BytesIO

# Create a directory for the data
data_dir = "buzzfeed_partisan_data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Function to download files directly instead of using git
def download_github_file(repo_owner, repo_name, path, save_path):
    """Download a specific file from GitHub"""
    url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/master/{path}"
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {save_path}")
        return True
    else:
        print(f"Failed to download: {url}")
        return False

# Download the main CSV files
files_to_download = [
    "data/all-partisan-sites.csv",
    "data/pages-info.csv",
    "data/domaintools-whois-results.csv"
]

for file_path in files_to_download:
    save_path = os.path.join(data_dir, os.path.basename(file_path))
    download_github_file(
        "BuzzFeedNews", 
        "2017-08-partisan-sites-and-facebook-pages", 
        file_path, 
        save_path
    )

# Load the partisan sites data
partisan_sites_path = os.path.join(data_dir, "all-partisan-sites.csv")
if os.path.exists(partisan_sites_path):
    partisan_sites = pd.read_csv(partisan_sites_path)
    
    # Look at the structure
    print("Dataset shape:", partisan_sites.shape)
    print("\nColumns:", partisan_sites.columns.tolist())
    print("\nFirst few rows:")
    print(partisan_sites.head())
    
    # Check for missing values
    print("\nMissing values per column:")
    print(partisan_sites.isnull().sum())
    
    # Explore partisan distribution
    if 'partisanship' in partisan_sites.columns:
        print("\nPartisan distribution:")
        print(partisan_sites['partisanship'].value_counts())
    elif 'partisan_code' in partisan_sites.columns:
        print("\nPartisan distribution:")
        print(partisan_sites['partisan_code'].value_counts())
else:
    print("Failed to download the partisan sites dataset")

# Load Facebook page info
pages_info_path = os.path.join(data_dir, "pages-info.csv")
if os.path.exists(pages_info_path):
    pages_info = pd.read_csv(pages_info_path)
    print("\nFacebook Pages Dataset shape:", pages_info.shape)
    print("\nFacebook Pages columns:", pages_info.columns.tolist())
    print(pages_info.head())
else:
    print("Failed to download the pages info dataset")

Downloaded: buzzfeed_partisan_data/all-partisan-sites.csv
Downloaded: buzzfeed_partisan_data/pages-info.csv
Downloaded: buzzfeed_partisan_data/domaintools-whois-results.csv
Dataset shape: (677, 5)

Columns: ['site', 'political_category', 'fb_id', 'unavailable_id', 'macedonian']

First few rows:
                                site political_category             fb_id  \
0                100percentfedup.com              right   311190048935167   
1                21stcenturywire.com               left   182032255155419   
2                     24dailynew.com              right   515629708825640   
3                       24usnews.com              right  1430973860248840   
4  4threvolutionarywar.wordpress.com               left               NaN   

   unavailable_id  macedonian  
0             NaN           0  
1             NaN           0  
2             NaN           1  
3             NaN           1  
4             NaN           0  

Missing values per column:
site                 

In [16]:
# Examine distribution of political categories
print("\nPolitical category distribution:")
political_dist = partisan_sites['political_category'].value_counts()
print(political_dist)

# Visualize the distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='political_category', data=partisan_sites)
plt.title('Distribution of Partisan Sites by Political Category')
plt.savefig(os.path.join(data_dir, 'political_distribution.png'))
plt.close()

# Check Macedonian sites
print("\nMacedonian sites distribution:")
macedonian_dist = partisan_sites['macedonian'].value_counts()
print(macedonian_dist)
print("\nMacedonian sites by political category:")
print(pd.crosstab(partisan_sites['political_category'], partisan_sites['macedonian']))


Political category distribution:
political_category
right    499
left     178
Name: count, dtype: int64

Macedonian sites distribution:
macedonian
0    596
1     81
Name: count, dtype: int64

Macedonian sites by political category:
macedonian            0   1
political_category         
left                176   2
right               420  79


In [17]:
# Check data types
print("fb_id dtype:", partisan_sites['fb_id'].dtype)
print("page_id dtype:", pages_info['page_id'].dtype)

# Convert both columns to strings for proper merging
partisan_sites['fb_id'] = partisan_sites['fb_id'].astype(str)
pages_info['page_id'] = pages_info['page_id'].astype(str)

# Now merge will work
merged_data = pd.merge(
    partisan_sites, 
    pages_info,
    left_on='fb_id', 
    right_on='page_id',
    how='inner'
)

print(f"\nNumber of sites with matched Facebook pages: {len(merged_data)}")

# Compare fan counts by political category
plt.figure(figsize=(10, 6))
sns.boxplot(x='political_category', y='fan_count', data=merged_data)
plt.yscale('log')
plt.title('Facebook Fan Count by Political Category')
plt.savefig(os.path.join(data_dir, 'fan_count_by_category.png'))
plt.close()

fb_id dtype: object
page_id dtype: int64

Number of sites with matched Facebook pages: 490


In [18]:
# Analyze domain registration data
whois_path = os.path.join(data_dir, "domaintools-whois-results.csv")
if os.path.exists(whois_path):
    whois_data = pd.read_csv(whois_path)
    print("\nWHOIS data shape:", whois_data.shape)
    print("\nWHOIS columns:", whois_data.columns.tolist())
    
    # Check sample data
    print("\nSample WHOIS data:")
    print(whois_data.head())
    
    # Check registration dates if available
    if 'create_date' in whois_data.columns:
        # Convert to datetime
        whois_data['create_date'] = pd.to_datetime(whois_data['create_date'], errors='coerce')
        
        # Extract year and month
        whois_data['reg_year'] = whois_data['create_date'].dt.year
        
        # Plot registration years
        plt.figure(figsize=(12, 6))
        whois_data['reg_year'].value_counts().sort_index().plot(kind='bar')
        plt.title('Domain Registration Years')
        plt.xlabel('Year')
        plt.ylabel('Number of Domains')
        plt.savefig(os.path.join(data_dir, 'domain_reg_years.png'))
        plt.close()


WHOIS data shape: (663, 70)

WHOIS columns: ['domain', 'whois url', 'admin contact name', 'admin contact org', 'admin contact street', 'admin contact city', 'admin contact state', 'admin contact postal', 'admin contact country', 'admin contact phone', 'admin contact fax', 'admin contact email 1', 'admin contact email 2', 'admin contact email 3', 'billing contact name', 'billing contact org', 'billing contact street', 'billing contact city', 'billing contact state', 'billing contact postal', 'billing contact country', 'billing contact phone', 'billing contact fax', 'billing contact email 1', 'billing contact email 2', 'billing contact email 3', 'registrant contact name', 'registrant contact org', 'registrant contact street', 'registrant contact city', 'registrant contact state', 'registrant contact postal', 'registrant contact country', 'registrant contact phone', 'registrant contact fax', 'registrant contact email 1', 'registrant contact email 2', 'registrant contact email 3', 'techni

In [19]:
# Check for any text content in our datasets
if 'about' in pages_info.columns:
    # Filter out missing about descriptions
    about_text = pages_info['about'].dropna()
    
    # Basic text statistics
    text_lengths = about_text.str.len()
    word_counts = about_text.str.split().str.len()
    
    print("\nAbout text statistics:")
    print(f"Number of pages with descriptions: {len(about_text)}")
    print(f"Average length: {text_lengths.mean():.1f} characters")
    print(f"Average word count: {word_counts.mean():.1f} words")
    
    # Word frequency analysis
    from collections import Counter
    import re
    
    # Combine all text
    all_text = ' '.join(about_text)
    
    # Simple tokenization (you might want to use NLTK or spaCy for better tokenization)
    words = re.findall(r'\b\w+\b', all_text.lower())
    
    # Count word frequencies
    word_freq = Counter(words)
    
    # Top 20 most common words
    print("\nTop 20 most common words in page descriptions:")
    for word, count in word_freq.most_common(20):
        print(f"{word}: {count}")


About text statistics:
Number of pages with descriptions: 426
Average length: 94.4 characters
Average word count: 13.9 words

Top 20 most common words in page descriptions:
the: 343
and: 241
news: 165
to: 152
of: 148
a: 118
com: 111
is: 111
for: 82
on: 70
http: 68
we: 67
in: 55
www: 55
conservative: 52
media: 51
political: 48
our: 48
s: 48
that: 45


In [20]:
# Check for domains with multiple Facebook pages
duplicate_sites = partisan_sites['site'].value_counts()
sites_with_multiple_pages = duplicate_sites[duplicate_sites > 1]

if len(sites_with_multiple_pages) > 0:
    print("\nSites with multiple Facebook pages:")
    print(sites_with_multiple_pages)
    
    # Example of a site with multiple pages
    example_site = sites_with_multiple_pages.index[0]
    print(f"\nExample - Facebook pages for {example_site}:")
    print(partisan_sites[partisan_sites['site'] == example_site])

# Summary statistics for exploration
print("\nExploration Summary:")
print(f"Total partisan sites: {len(partisan_sites)}")
print(f"  - With Facebook IDs: {partisan_sites['fb_id'].notna().sum()}")
print(f"  - Macedonian sites: {partisan_sites['macedonian'].sum()}")
print(f"Total Facebook pages: {len(pages_info)}")
print(f"Matched sites with pages: {len(merged_data)}")


Sites with multiple Facebook pages:
site
topsecretinfodump.com    2
cscmediagroupus.com      2
usanewshome.com          2
donaldtrumpnews.co       2
westernjournalism.com    2
thetruthdivision.com     2
cosmo-politics.com       2
redflagnews.com          2
analogpolitics.com       2
bluedotdaily.com         2
Name: count, dtype: int64

Example - Facebook pages for topsecretinfodump.com:
                      site political_category            fb_id  \
568  topsecretinfodump.com              right  507165509486196   
569  topsecretinfodump.com              right  592074077639509   

     unavailable_id  macedonian  
568             NaN           0  
569             NaN           0  

Exploration Summary:
Total partisan sites: 677
  - With Facebook IDs: 677
  - Macedonian sites: 81
Total Facebook pages: 452
Matched sites with pages: 490
