# Data Collection
The code in this file collects quality information about wikipedia articles about U.S. cities using ORES. The data is then saved into a file called 'wp_scored_city_articles_by_state.csv'. Much of the code here is adapted from [here](https://drive.google.com/file/d/17C9xsmR9U3lJeD52UTbAedlHDetwYsxs/view) under the creative commons license.

## Imports

In [49]:
 
import json, time, urllib.parse
import pandas as pd
import requests
from collections import Counter

## Data Constants
This cell is declaring constants such as latency, request headers, etc.

In [154]:
#########
#
#    CONSTANTS

#    The current LiftWing ORES API endpoint and prediction model
#


#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "rev_ids":      "",       # this request requires a revision id
}

ACCESS_TOKEN = ""

## Retrieval Functions
This cell declares key functions for retrieving revision id data from wikipedia and running the ORES ML API for quality assessment of the articles.

In [251]:


#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    # if article_revid:
    #     request_data['rev_ids'] = "|".join(article_revid)
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not article_revid:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    try:
       
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
       
        response = requests.get("https://ores.wikimedia.org/v3/scores/enwiki/?models=draftquality|wp10&revids="+"|".join(str(item) for item in article_revid), headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


def get_revision_id(titles):
    if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
    S = requests.Session()

    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": titles,
        "prop": "info",
        "inprop": "url|talkid"
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    print(DATA)

    PAGES = DATA["query"]["pages"]
    rev_dict = {}
    for k, v in PAGES.items():
        rev_dict[v["title"]] = v["lastrevid"]
    return rev_dict



## Data Retrieval 
The following cell retrieves the data using the above functions.

In [250]:

df = pd.read_csv("us_cities_by_state_SEPT.2023.csv")

rev_dict = {}
articles = []
revisions = []
scores = []

city_pages = df['page_title'].to_list()
for i in range(0, len(city_pages), 10):
    print('Getting batch', i//10)
    batch = city_pages[i:i+10]
    titles = "|".join(str(b) for b in batch)
    new_dict = get_revision_id(titles)
    rev_dict.update(new_dict)
    score = request_ores_score_per_article(article_revid=list(new_dict.values()),
                                       email_address="cjault@uw.edu",
                                       access_token=ACCESS_TOKEN)
    articles += batch
    revisions += new_dict.values()
    for k in new_dict.keys():
        scores.append(score["enwiki"]["scores"][str(new_dict[k])]["wp10"]["score"]['prediction'])


Getting batch 0
{'batchcomplete': '', 'query': {'pages': {'104730': {'pageid': 104730, 'ns': 0, 'title': 'Abbeville, Alabama', 'contentmodel': 'wikitext', 'pagelanguage': 'en', 'pagelanguagehtmlcode': 'en', 'pagelanguagedir': 'ltr', 'touched': '2023-10-10T22:35:37Z', 'lastrevid': 1171163550, 'length': 24706, 'talkid': 281244, 'fullurl': 'https://en.wikipedia.org/wiki/Abbeville,_Alabama', 'editurl': 'https://en.wikipedia.org/w/index.php?title=Abbeville,_Alabama&action=edit', 'canonicalurl': 'https://en.wikipedia.org/wiki/Abbeville,_Alabama'}, '104761': {'pageid': 104761, 'ns': 0, 'title': 'Adamsville, Alabama', 'contentmodel': 'wikitext', 'pagelanguage': 'en', 'pagelanguagehtmlcode': 'en', 'pagelanguagedir': 'ltr', 'touched': '2023-10-10T22:35:37Z', 'lastrevid': 1177621427, 'length': 18040, 'talkid': 281272, 'fullurl': 'https://en.wikipedia.org/wiki/Adamsville,_Alabama', 'editurl': 'https://en.wikipedia.org/w/index.php?title=Adamsville,_Alabama&action=edit', 'canonicalurl': 'https://en.

KeyboardInterrupt: 

## Unwanted Articles
The cell below outlines unwanted articles that are not cities. These were calculated from finding duplicate article names and revision id's. This means that the crawler that created the file of cities hit these points multiple times which indicates it is a region or at least not a city.

In [None]:
# Articles we do not want to target
blacklist_articles = ['County (United States)', 'Population', 'Square mile', 'Federal Information Processing Standards', 'American National Standards Institute', 'Geographic Names Information System', 'Grand Divisions of Tennessee', '2020 United States census', '2010 United States census', 'Accomack County, Virginia', 'Albemarle County, Virginia', 'Alleghany County, Virginia', 'Amelia County, Virginia', 'Amherst County, Virginia', 'Appomattox County, Virginia', 'Arlington County, Virginia', 'Augusta County, Virginia', 'Bath County, Virginia', 'Bedford County, Virginia', 'Bland County, Virginia', 'Botetourt County, Virginia', 'Brunswick County, Virginia', 'Buchanan County, Virginia', 'Buckingham County, Virginia', 'Campbell County, Virginia', 'Caroline County, Virginia', 'Carroll County, Virginia', 'Charles City County, Virginia', 'Charlotte County, Virginia', 'Chesterfield County, Virginia', 'Clarke County, Virginia', 'Craig County, Virginia', 'Culpeper County, Virginia', 'Cumberland County, Virginia', 'Dickenson County, Virginia', 'Dinwiddie County, Virginia', 'Essex County, Virginia', 'Fairfax County, Virginia', 'Fauquier County, Virginia', 'Floyd County, Virginia', 'Fluvanna County, Virginia', 'Franklin County, Virginia', 'Frederick County, Virginia', 'Giles County, Virginia', 'Gloucester County, Virginia', 'Goochland County, Virginia', 'Grayson County, Virginia', 'Greene County, Virginia', 'Greensville County, Virginia', 'Halifax County, Virginia', 'Hanover County, Virginia', 'Henrico County, Virginia', 'Henry County, Virginia', 'Highland County, Virginia', 'Isle of Wight County, Virginia', 'James City County, Virginia', 'King and Queen County, Virginia', 'King George County, Virginia', 'King William County, Virginia', 'Lancaster County, Virginia', 'Lee County, Virginia', 'Loudoun County, Virginia', 'Louisa County, Virginia', 'Lunenburg County, Virginia', 'Madison County, Virginia', 'Mathews County, Virginia', 'Mecklenburg County, Virginia', 'Middlesex County, Virginia', 'Montgomery County, Virginia', 'Nelson County, Virginia', 'New Kent County, Virginia', 'Northampton County, Virginia', 'Northumberland County, Virginia', 'Nottoway County, Virginia', 'Orange County, Virginia', 'Page County, Virginia', 'Patrick County, Virginia', 'Pittsylvania County, Virginia', 'Powhatan County, Virginia', 'Prince Edward County, Virginia', 'Prince George County, Virginia', 'Prince William County, Virginia', 'Pulaski County, Virginia', 'Rappahannock County, Virginia', 'Richmond County, Virginia', 'Roanoke County, Virginia', 'Rockbridge County, Virginia', 'Rockingham County, Virginia', 'Russell County, Virginia', 'Scott County, Virginia', 'Shenandoah County, Virginia', 'Smyth County, Virginia', 'Southampton County, Virginia', 'Spotsylvania County, Virginia', 'Stafford County, Virginia', 'Surry County, Virginia', 'Sussex County, Virginia', 'Tazewell County, Virginia', 'Warren County, Virginia', 'Washington County, Virginia', 'Westmoreland County, Virginia', 'Wise County, Virginia', 'Wythe County, Virginia', 'York County, Virginia', 'Alexandria, Virginia', 'Bristol, Virginia', 'Buena Vista, Virginia', 'Charlottesville, Virginia', 'Chesapeake, Virginia', 'Colonial Heights, Virginia', 'Covington, Virginia', 'Danville, Virginia', 'Emporia, Virginia', 'Fairfax, Virginia', 'Falls Church, Virginia', 'Franklin, Virginia', 'Fredericksburg, Virginia', 'Galax, Virginia', 'Hampton, Virginia', 'Harrisonburg, Virginia', 'Hopewell, Virginia', 'Lexington, Virginia', 'Lynchburg, Virginia', 'Manassas, Virginia', 'Manassas Park, Virginia', 'Martinsville, Virginia', 'Newport News, Virginia', 'Norfolk, Virginia', 'Norton, Virginia', 'Petersburg, Virginia', 'Poquoson, Virginia', 'Portsmouth, Virginia', 'Radford, Virginia', 'Richmond, Virginia', 'Roanoke, Virginia', 'Salem, Virginia', 'Staunton, Virginia', 'Suffolk, Virginia', 'Virginia Beach, Virginia', 'Waynesboro, Virginia', 'Williamsburg, Virginia', 'Winchester, Virginia']
blacklist_rev = [1178988667, 1179591354, 1179747974, 1172478013, 1175882108, 1179447400, 1175359799, 1179393707, 1179894823, 1177127699, 1174935575, 1177381277, 1176261398, 1172759600, 1173914155, 1180408287, 1179806976, 1179190289, 1175701448, 1172622410, 1178411699, 1173345759, 1161252887, 1179396671, 1177668957, 1175705267, 1159576648, 1176143424, 1175706369, 1166068523, 1159576984, 1172873564, 1180125397, 1175706773, 1159577209, 1179891099, 1175711008, 1179705918, 1177286317, 1175731895, 1177061279, 1178238427, 1174839386, 1163056257, 1179710658, 1173720612, 1168636825, 1176840755, 1180298399, 1166092103, 1172882656, 1178174208, 1174411303, 1179512268, 1172934669, 1176008529, 1177803019, 1171044243, 1176205466, 1175410763, 1178691008, 1176589473, 1172752518, 1159606947, 1159607022, 1159686921, 1159687045, 1177439291, 1152594631, 1178415845, 1171659522, 1166671671, 1172345198, 1160205180, 1171107080, 1172540443, 1172200578, 1170250578, 1159751766, 1173346044, 1173169934, 1176303629, 1159752330, 1180119731, 1166940769, 1175386938, 1168629971, 1172930083, 1159752983, 1172621888, 1174079675, 1179713060, 1153510493, 1177202263, 1175370881, 1174236915, 1177426495, 1172617911, 1173161788, 1175561145, 1179981187, 1178752866, 1169693796, 1176624311, 1180367758, 1172656361, 1177346134, 1179344628, 1177498283, 1169222060, 1169330302, 1176688108, 1174387012, 1175067625, 1177312743, 1170363182, 1178256264, 1173219237, 1178925952, 1178248304, 1173345591, 1176128309, 1180195119, 1178149428, 1177308264, 1179945503, 1177933906, 1180350896, 1160458417, 1177840438, 1180438824, 1179580801, 1174929714, 1180350446, 1179983043, 1173991892, 1177187603, 1180409289, 1179974819, 1167253545, 1179596070, 1179433809]
print(blacklist_articles)

## Data Cleaning
The following cell removes unwanted articles from the list and cleans the data before combining the other data files and saving the final results.

In [213]:

final_revisions = []
final_articles = []
final_scores = []
for i in range(len(revisions)):
    if revisions[i] not in blacklist_rev:
        final_revisions.append(revisions[i])
        final_scores.append(scores[i])
for i in range(len(articles)):
    if articles[i] not in blacklist_articles:
        final_articles.append(articles[i])

df3 = pd.DataFrame({'article_title': final_articles, 'revision_id': final_revisions, 'article_quality': final_scores})

df_regions = pd.read_csv('US States by Region - US Census Bureau - Sheet1.csv')
df_regions = df_regions.rename(columns={'STATE': 'state', 'REGION':'regional_division'})
df_state = pd.read_csv('us_cities_by_state_SEPT.2023.csv')
df_state = df_state.rename(columns={'page_title': 'article_title'})
df_scores = df3.copy()
df_state['state'] = df_state['state'].replace('Georgia_(U.S._state)', 'Georgia')
df_state['state'] = df_state['state'].str.replace('_', ' ')


df_merged = pd.merge(df_scores, df_state[['state', 'article_title']], on='article_title', how='left')
df_merged = pd.merge(df_merged, df_regions[['state', 'regional_division']], on='state', how='left')

df_pop = pd.read_csv('population.csv')

# Remove commas and convert the column to integers
df_pop['Population 2022'] = df_pop['Population 2022'].str.replace(',', '').astype(int)
df_pop = df_pop.rename(columns={'State': 'state', 'Population 2022':'population'})

df_final = pd.merge(df_merged, df_pop[['state', 'population']], on='state', how='left')
df_final.to_csv('wp_scored_city_articles_by_state.csv',index=False)
df_final.head()


18004
18004
18006
['East Gillespie, Illinois', 'East Hazel Crest, Illinois', 'East Moline, Illinois', 'Easton, Illinois', 'East Peoria, Illinois']
['Wamsutter, Wyoming', 'Wheatland, Wyoming', 'Worland, Wyoming', 'Wright, Wyoming', 'Yoder, Wyoming']
[1169591845, 1176370621, 1166347917, 1166334449, 1171182284]
['GA', 'GA', 'GA', 'GA', 'C']
[16942, 17075]
['County (United States)', 'Population', 'Square mile', 'Federal Information Processing Standards', 'American National Standards Institute', 'Geographic Names Information System', 'Grand Divisions of Tennessee', '2020 United States census', '2010 United States census', 'Accomack County, Virginia', 'Albemarle County, Virginia', 'Alleghany County, Virginia', 'Amelia County, Virginia', 'Amherst County, Virginia', 'Appomattox County, Virginia', 'Arlington County, Virginia', 'Augusta County, Virginia', 'Bath County, Virginia', 'Bedford County, Virginia', 'Bland County, Virginia', 'Botetourt County, Virginia', 'Brunswick County, Virginia', 'Bu