# Exploration of Craigslist

# Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pprint import pprint

# Obtain web data, including handling JSON
import requests

# Library for Craigslist
import craigslist

# Craigslist APIs

In [2]:
# Craigslist reference API
#   description: https://www.craigslist.org/about/reference
#   endpoint:    http://reference.craigslist.org
base_url = 'http://reference.craigslist.org/'

# Resource names that get appended to base_url:
#   Areas      - the areas and subareas for which a Craigslist site is established
#   Categories - the categories associated with entries on Craigslist
areas_request, categories_request = 'Areas', 'Categories'


## Explore Craigslist Areas

In [3]:
# Show the full URL that will be requested for the Areas resource
base_url + areas_request

'http://reference.craigslist.org/Areas'

In [4]:
# Get the list of Craigslist areas and subareas.
# requests applies no timeout by default, so pass one explicitly to keep this
# cell from hanging forever if the server never answers.
response = requests.get(base_url + areas_request, timeout=30)

In [5]:
# Confirm that the web response code is OK.
# The message is an f-string so the actual status code is interpolated;
# without the prefix the braces were printed literally.
assert response.status_code == 200, f"Return code NOK: {response.status_code}"

In [6]:
# Decode the JSON body of the response into Python objects.
# The result is iterated further down as a sequence of per-area records
# (each accessed by keys such as 'AreaID', 'Description' and 'SubAreas').
area_list = response.json()
# pprint(area_list)

In [7]:
# Build the areas dataframe in a single expression; the nested SubAreas
# dictionaries are left out because they form a separate dataframe
areas_df = pd.DataFrame(area_list).drop(columns=['SubAreas'])
areas_df

Unnamed: 0,Abbreviation,AreaID,Country,Description,Hostname,Latitude,Longitude,Region,ShortDescription,Timezone
0,sfo,1,US,SF bay area,sfbay,37.500000,-122.250000,CA,SF bay area,America/Los_Angeles
1,sea,2,US,seattle-tacoma,seattle,47.606400,-122.331001,WA,seattle,America/Los_Angeles
2,nyc,3,US,new york city,newyork,40.714199,-74.006401,NY,new york,America/New_York
3,bos,4,US,boston,boston,42.358299,-71.060303,MA,boston,America/New_York
4,lax,7,US,los angeles,losangeles,34.052200,-118.242996,CA,los angeles,America/Los_Angeles
...,...,...,...,...,...,...,...,...,...,...
702,hnf,709,US,hanford-corcoran,hanford,36.327400,-119.646004,CA,hanford,America/Los_Angeles
703,smx,710,US,"santa maria, CA",santamaria,34.963799,-120.433296,CA,santa maria,America/Los_Angeles
704,okv,711,US,"winchester, VA",winchester,39.178299,-78.166603,VA,winchester,America/New_York
705,vaw,712,US,southwest VA,swva,36.892803,-82.084351,VA,southwest VA,America/New_York


In [8]:
# Create a dataframe of subareas that are correlated to Area IDs.
# One frame is collected per area and everything is concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies all accumulated rows on every iteration.
subarea_frames = []
for a in area_list:
    d_text = f"Area [{a['AreaID']}]: {a['Description']} => "

    # .get() covers both a missing 'SubAreas' key and an empty value
    sub_records = a.get('SubAreas')
    if sub_records:
        # For this specific area, build a temp dataframe straight from its
        # subarea records
        temp_df = pd.DataFrame(sub_records)

        # Tag every subarea row with the AreaID of its parent area
        temp_df['AreaID'] = a['AreaID']
        subarea_frames.append(temp_df)

        # Add a note to the debug text
        d_text += f"{len(temp_df)} Subareas processed"
    else:
        # There are no subareas for this specific area
        d_text += "No Subareas"

    # print(d_text)

# pd.concat raises on an empty list, so fall back to an empty frame
subareas_df = (
    pd.concat(subarea_frames, ignore_index=True) if subarea_frames else pd.DataFrame()
)

In [9]:
# Display the combined subarea table (each row carries the AreaID of its parent area)
subareas_df

Unnamed: 0,Abbreviation,Description,ShortDescription,SubAreaID,AreaID
0,sfc,city of san francisco,san francisco,1,1
1,sby,south bay area,south bay,2,1
2,eby,east bay area,east bay,3,1
3,pen,peninsula,peninsula,4,1
4,nby,north bay / marin,north bay,5,1
...,...,...,...,...,...
98,psc,pasco co,pasco co,3,37
99,pnl,pinellas co,pinellas co,4,37
100,lee,lee county,lee county,1,125
101,chl,charlotte county,charlotte co,2,125


In [10]:
# Left-join the subareas onto the areas so that areas without subareas are
# kept; columns present in both frames get an _Area / _SubArea suffix
area_info_df = pd.merge(
    areas_df,
    subareas_df,
    how='left',
    on='AreaID',
    suffixes=('_Area', '_SubArea'),
)

In [11]:
# Reorder the merged columns: area-level fields first, subarea fields last
area_columns = [
    'AreaID', 'Abbreviation_Area', 'ShortDescription_Area', 'Description_Area',
    'Hostname', 'Region', 'Country', 'Latitude', 'Longitude', 'Timezone',
]
subarea_columns = [
    'SubAreaID', 'Abbreviation_SubArea', 'ShortDescription_SubArea', 'Description_SubArea',
]
area_info_df = area_info_df.loc[:, area_columns + subarea_columns]
area_info_df

Unnamed: 0,AreaID,Abbreviation_Area,ShortDescription_Area,Description_Area,Hostname,Region,Country,Latitude,Longitude,Timezone,SubAreaID,Abbreviation_SubArea,ShortDescription_SubArea,Description_SubArea
0,1,sfo,SF bay area,SF bay area,sfbay,CA,US,37.500000,-122.250000,America/Los_Angeles,1.0,sfc,san francisco,city of san francisco
1,1,sfo,SF bay area,SF bay area,sfbay,CA,US,37.500000,-122.250000,America/Los_Angeles,2.0,sby,south bay,south bay area
2,1,sfo,SF bay area,SF bay area,sfbay,CA,US,37.500000,-122.250000,America/Los_Angeles,3.0,eby,east bay,east bay area
3,1,sfo,SF bay area,SF bay area,sfbay,CA,US,37.500000,-122.250000,America/Los_Angeles,4.0,pen,peninsula,peninsula
4,1,sfo,SF bay area,SF bay area,sfbay,CA,US,37.500000,-122.250000,America/Los_Angeles,5.0,nby,north bay,north bay / marin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,709,hnf,hanford,hanford-corcoran,hanford,CA,US,36.327400,-119.646004,America/Los_Angeles,,,,
786,710,smx,santa maria,"santa maria, CA",santamaria,CA,US,34.963799,-120.433296,America/Los_Angeles,,,,
787,711,okv,winchester,"winchester, VA",winchester,VA,US,39.178299,-78.166603,America/New_York,,,,
788,712,vaw,southwest VA,southwest VA,swva,VA,US,36.892803,-82.084351,America/New_York,,,,


## Explore Craigslist Categories

In [12]:
# Show the full URL that will be requested for the Categories resource
base_url + categories_request

'http://reference.craigslist.org/Categories'

In [13]:
# Get the list of Craigslist categories.
# requests applies no timeout by default, so pass one explicitly to keep this
# cell from hanging forever if the server never answers.
response = requests.get(base_url + categories_request, timeout=30)

In [14]:
# Confirm that the web response code is OK.
# The message is an f-string so the actual status code is interpolated;
# without the prefix the braces were printed literally.
assert response.status_code == 200, f"Return code NOK: {response.status_code}"

In [15]:
# Decode the JSON body of the response into Python objects
# (handed straight to pd.DataFrame in the following cell)
cat_list = response.json()
# pprint(cat_list)

In [16]:
# Create dataframe of Categories from the decoded category records
cat_df = pd.DataFrame( cat_list )
# cat_df

In [17]:
# Category type codes mapped to readable names (based upon reference page info).
# The meaning of 'L' is not known yet, so it is labelled 'TBD'.
cat_type_dict = dict([
    ('B', 'Services'),
    ('C', 'Community'),
    ('E', 'Events'),
    ('G', 'Gigs'),
    ('H', 'Housing'),
    ('J', 'Jobs'),
    ('P', 'Personals'),
    ('R', 'Resumes'),
    ('S', 'For Sale / Wanted'),
    ('L', 'TBD'),
])

In [18]:
# Translate each one-letter Type code into its readable name;
# dict.get yields None for any code that is missing from cat_type_dict
cat_df['Type_Definition'] = cat_df['Type'].apply(cat_type_dict.get)
cat_df

Unnamed: 0,Abbreviation,CategoryID,Description,Type,Type_Definition
0,apa,1,apts/housing for rent,H,Housing
1,hou,2,wanted: apts,H,Housing
2,com,3,general community,C,Community
3,biz,4,small biz ads,B,Services
4,for,5,general for sale - by owner,S,For Sale / Wanted
...,...,...,...,...,...
175,tro,205,trailers - by owner,S,For Sale / Wanted
176,trb,206,trailers - by dealer,S,For Sale / Wanted
177,cms,207,cell phone / mobile services,B,Services
178,avo,208,aviation - by owner,S,For Sale / Wanted


In [19]:
# Category type 'L' has no known meaning (hence 'TBD' above);
# list the categories that carry it
type_l_mask = cat_df['Type'].eq('L')
cat_df.loc[type_l_mask]

Unnamed: 0,Abbreviation,CategoryID,Description,Type,Type_Definition
48,cal,66,event calendar,L,TBD


# Searches on Craigslist

In [20]:
# Craigslist search: http://YOURCITY.craigslist.org/search/sss?format=rss&query=SearchString
# This returns an RSS/XML file

In [21]:
# Instead, use the craigslist library to perform the search, for example:
# craigslist.search(
#     area,
#     category,
#     type_="jsonsearch",
#     get_detailed_posts=False,
#     cache=True,
#     cachedir=os.path.expanduser('~'),
#     executor=None,
#     executor_class='concurrent.futures.ThreadPoolExecutor',
#     max_workers=None,
#     get=requests_get,
#     **kwargs):

In [22]:
# Search for apartments in Chicago within 1 mile(?) of postal code 60661
search_params = dict(
    area='chicago',
    category='apa',
    postal=60661,
    search_distance=1,
    get_detailed_posts=True,
    limit=1,
)
search_generator = craigslist.search(**search_params)

In [23]:
# Pull posts from the generator, stopping after at most 101 of them, and
# turn each post into a plain dict. range() comes first in zip() so the
# generator is not advanced once the cap has been reached.
post_cap = 101
p_list = [dict(p._asdict()) for _, p in zip(range(post_cap), search_generator)]

post_df = pd.DataFrame(p_list)
post_df

Unnamed: 0,id,repost_id,url,full_title,short_title,hood,num_bedrooms,sqftage,price,body_html,body_text,address,available_date
0,7007855615,,https://chicago.craigslist.org/chc/apa/d/chica...,$1677 Brand New Studio in South Loop! Lease up...,Brand New Studio in South Loop! Lease up! In-u...,Chicago - South Loop,,,1677,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 115504620Rent: 1677 / Month...,South Wells near Polk,
1,7006440536,,https://chicago.craigslist.org/chc/apa/d/chica...,$1750 / 1br - Amazing 1 bed in Fulton River w...,Amazing 1 bed in Fulton River w Great Views an...,Chicago - Fulton River District,1.0,,1750,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 9049569Rent: 1750 / MonthBe...,North Des Plaines St near Fulton,
2,7004840465,6925291786,https://chicago.craigslist.org/chc/apa/d/chica...,"$3310 / 1br - 895ft2 - Massive 1 Bed, 895 Sq ...","Massive 1 Bed, 895 Sq Feet, 1 MONTH FREE, Stun...",River North / Old Town / Gold Coast,1.0,895.0,3310,"<section id=""postingbody"">\n <br>\n<br>...",Stunning chef's kitchen; custom stone countert...,,2019-10-22
3,7007224782,,https://chicago.craigslist.org/chc/apa/d/chica...,$2250 / 2br - 1200ft2 - Luxury Lofts Availabl...,Luxury Lofts Available for rent! Contact Us NO...,Chicago,2.0,1200.0,2250,"<section id=""postingbody"">\n <br>\nRent...",Rent: $2250\nBeds: 2\nBath: 2\nAvailable Date:...,S Dearborn St.,2019-11-01
4,7009561889,,https://chicago.craigslist.org/chc/apa/d/chica...,"$2095 / 1br - 750ft2 - 1 Bd on West Grand, Ha...","1 Bd on West Grand, Hardwood Floors, Furnished...",Chicago - West Loop,1.0,750.0,2095,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 184284042Rent: 2095 / Month...,645 West Grand,2019-10-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,7012705471,,https://chicago.craigslist.org/chc/apa/d/chica...,$2595 / 1br - First class new construction,First class new construction,West Loop,1.0,,2595,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 129314294Rent: 2595 / Month...,West Madison,2019-11-03
97,7005590988,,https://chicago.craigslist.org/chc/apa/d/chica...,$2600 / 1br - 900ft2 - Fantastic 1bed/1bath+D...,Fantastic 1bed/1bath+Den in the Emerald Towers...,Chicago-West Loop,1.0,900.0,2600,"<section id=""postingbody"">\n <br>\nUnit...",Unit Features:\n\nONE BEDROOM + DEN IN W/ AMAZ...,125 S. Green Street,2019-12-01
98,7005915714,6979828575,https://chicago.craigslist.org/chc/apa/d/chica...,$3590 / 2br - 1200ft2 - Furnished Sleek West ...,"Furnished Sleek West Loop w/ Gym, Pool, W/D, n...",West Loop/ Chicago,2.0,1200.0,3590,"<section id=""postingbody"">\n <br>\n<br>...","12+ month lease: $3,990* / mo\n6 to 12 month l...",180 N Jefferson Street,2019-12-01
99,7012695921,,https://chicago.craigslist.org/chc/apa/d/chica...,$2280 / 1br - 800ft2 - Luxury West Loop 1 br ...,Luxury West Loop 1 br with parking included,West Loop,1.0,800.0,2280,"<section id=""postingbody"">\n <br>\n<br>...","Unit features soaring 9' ceilings, south-facin...",Green near Monroe,2019-11-15


In [24]:
# Show the posts dataframe again (the same frame the previous cell displayed)
post_df

Unnamed: 0,id,repost_id,url,full_title,short_title,hood,num_bedrooms,sqftage,price,body_html,body_text,address,available_date
0,7007855615,,https://chicago.craigslist.org/chc/apa/d/chica...,$1677 Brand New Studio in South Loop! Lease up...,Brand New Studio in South Loop! Lease up! In-u...,Chicago - South Loop,,,1677,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 115504620Rent: 1677 / Month...,South Wells near Polk,
1,7006440536,,https://chicago.craigslist.org/chc/apa/d/chica...,$1750 / 1br - Amazing 1 bed in Fulton River w...,Amazing 1 bed in Fulton River w Great Views an...,Chicago - Fulton River District,1.0,,1750,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 9049569Rent: 1750 / MonthBe...,North Des Plaines St near Fulton,
2,7004840465,6925291786,https://chicago.craigslist.org/chc/apa/d/chica...,"$3310 / 1br - 895ft2 - Massive 1 Bed, 895 Sq ...","Massive 1 Bed, 895 Sq Feet, 1 MONTH FREE, Stun...",River North / Old Town / Gold Coast,1.0,895.0,3310,"<section id=""postingbody"">\n <br>\n<br>...",Stunning chef's kitchen; custom stone countert...,,2019-10-22
3,7007224782,,https://chicago.craigslist.org/chc/apa/d/chica...,$2250 / 2br - 1200ft2 - Luxury Lofts Availabl...,Luxury Lofts Available for rent! Contact Us NO...,Chicago,2.0,1200.0,2250,"<section id=""postingbody"">\n <br>\nRent...",Rent: $2250\nBeds: 2\nBath: 2\nAvailable Date:...,S Dearborn St.,2019-11-01
4,7009561889,,https://chicago.craigslist.org/chc/apa/d/chica...,"$2095 / 1br - 750ft2 - 1 Bd on West Grand, Ha...","1 Bd on West Grand, Hardwood Floors, Furnished...",Chicago - West Loop,1.0,750.0,2095,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 184284042Rent: 2095 / Month...,645 West Grand,2019-10-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,7012705471,,https://chicago.craigslist.org/chc/apa/d/chica...,$2595 / 1br - First class new construction,First class new construction,West Loop,1.0,,2595,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 129314294Rent: 2595 / Month...,West Madison,2019-11-03
97,7005590988,,https://chicago.craigslist.org/chc/apa/d/chica...,$2600 / 1br - 900ft2 - Fantastic 1bed/1bath+D...,Fantastic 1bed/1bath+Den in the Emerald Towers...,Chicago-West Loop,1.0,900.0,2600,"<section id=""postingbody"">\n <br>\nUnit...",Unit Features:\n\nONE BEDROOM + DEN IN W/ AMAZ...,125 S. Green Street,2019-12-01
98,7005915714,6979828575,https://chicago.craigslist.org/chc/apa/d/chica...,$3590 / 2br - 1200ft2 - Furnished Sleek West ...,"Furnished Sleek West Loop w/ Gym, Pool, W/D, n...",West Loop/ Chicago,2.0,1200.0,3590,"<section id=""postingbody"">\n <br>\n<br>...","12+ month lease: $3,990* / mo\n6 to 12 month l...",180 N Jefferson Street,2019-12-01
99,7012695921,,https://chicago.craigslist.org/chc/apa/d/chica...,$2280 / 1br - 800ft2 - Luxury West Loop 1 br ...,Luxury West Loop 1 br with parking included,West Loop,1.0,800.0,2280,"<section id=""postingbody"">\n <br>\n<br>...","Unit features soaring 9' ceilings, south-facin...",Green near Monroe,2019-11-15


In [25]:
# post_df.dtypes

In [26]:
# Summary statistics of the length (in characters) of each post body
body_lengths = post_df['body_text'].map(len)
body_lengths.describe()

count     101.000000
mean     1255.742574
std       725.051911
min         0.000000
25%       695.000000
50%      1276.000000
75%      1643.000000
max      2779.000000
Name: body_text, dtype: float64

In [27]:
# Get all of the words in the body_text: lower-case each body, split it on
# whitespace, and store the resulting tokens in sorted order
post_df['word_list'] = post_df['body_text'].apply(
    lambda body: sorted(body.lower().split())
)
post_df

Unnamed: 0,id,repost_id,url,full_title,short_title,hood,num_bedrooms,sqftage,price,body_html,body_text,address,available_date,word_list
0,7007855615,,https://chicago.craigslist.org/chc/apa/d/chica...,$1677 Brand New Studio in South Loop! Lease up...,Brand New Studio in South Loop! Lease up! In-u...,Chicago - South Loop,,,1677,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 115504620Rent: 1677 / Month...,South Wells near Polk,,"[#481.011892), (just, (some, (weight, -broker,..."
1,7006440536,,https://chicago.craigslist.org/chc/apa/d/chica...,$1750 / 1br - Amazing 1 bed in Fulton River w...,Amazing 1 bed in Fulton River w Great Views an...,Chicago - Fulton River District,1.0,,1750,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 9049569Rent: 1750 / MonthBe...,North Des Plaines St near Fulton,,"[#481.011892), (dog, (if,, (just, (weight, -br..."
2,7004840465,6925291786,https://chicago.craigslist.org/chc/apa/d/chica...,"$3310 / 1br - 895ft2 - Massive 1 Bed, 895 Sq ...","Massive 1 Bed, 895 Sq Feet, 1 MONTH FREE, Stun...",River North / Old Town / Gold Coast,1.0,895.0,3310,"<section id=""postingbody"">\n <br>\n<br>...",Stunning chef's kitchen; custom stone countert...,,2019-10-22,"[(and, +, +, +, +, +, -, -, --, --, --, --, 10..."
3,7007224782,,https://chicago.craigslist.org/chc/apa/d/chica...,$2250 / 2br - 1200ft2 - Luxury Lofts Availabl...,Luxury Lofts Available for rent! Contact Us NO...,Chicago,2.0,1200.0,2250,"<section id=""postingbody"">\n <br>\nRent...",Rent: $2250\nBeds: 2\nBath: 2\nAvailable Date:...,S Dearborn St.,2019-11-01,"[$2,250, $2250, $65, &, -, -, -, -, -, 1, 108t..."
4,7009561889,,https://chicago.craigslist.org/chc/apa/d/chica...,"$2095 / 1br - 750ft2 - 1 Bd on West Grand, Ha...","1 Bd on West Grand, Hardwood Floors, Furnished...",Chicago - West Loop,1.0,750.0,2095,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 184284042Rent: 2095 / Month...,645 West Grand,2019-10-29,"[$100/month., /, 1, 1, 1, 184284042rent:, 1ava..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,7012705471,,https://chicago.craigslist.org/chc/apa/d/chica...,$2595 / 1br - First class new construction,First class new construction,West Loop,1.0,,2595,"<section id=""postingbody"">\n <h4>PROPER...",PROPERTY INFO\nID: 129314294Rent: 2595 / Month...,West Madison,2019-11-03,"[(garage), *pricing, /, 129314294rent:, 1avail..."
97,7005590988,,https://chicago.craigslist.org/chc/apa/d/chica...,$2600 / 1br - 900ft2 - Fantastic 1bed/1bath+D...,Fantastic 1bed/1bath+Den in the Emerald Towers...,Chicago-West Loop,1.0,900.0,2600,"<section id=""postingbody"">\n <br>\nUnit...",Unit Features:\n\nONE BEDROOM + DEN IN W/ AMAZ...,125 S. Green Street,2019-12-01,"[$200, +, -, -, 12/1, 13'', 2, 60647, 7x10, a/..."
98,7005915714,6979828575,https://chicago.craigslist.org/chc/apa/d/chica...,$3590 / 2br - 1200ft2 - Furnished Sleek West ...,"Furnished Sleek West Loop w/ Gym, Pool, W/D, n...",West Loop/ Chicago,2.0,1200.0,3590,"<section id=""postingbody"">\n <br>\n<br>...","12+ month lease: $3,990* / mo\n6 to 12 month l...",180 N Jefferson Street,2019-12-01,"[#chi123), $3,590*, $3,990*, $3,990/, $4,090*,..."
99,7012695921,,https://chicago.craigslist.org/chc/apa/d/chica...,$2280 / 1br - 800ft2 - Luxury West Loop 1 br ...,Luxury West Loop 1 br with parking included,West Loop,1.0,800.0,2280,"<section id=""postingbody"">\n <br>\n<br>...","Unit features soaring 9' ceilings, south-facin...",Green near Monroe,2019-11-15,"[(the, 1, 15, 24, 9', 90/94/290,, a, a, a, acc..."


In [36]:
# Function to keep only alphabetic strings as words
def keep_alpha(a_list):
    """Return a new list with only the items of a_list whose isalpha() is true.

    The relative order of the items and any duplicates are preserved; the
    input list itself is not modified.
    """
    return [token for token in a_list if token.isalpha()]

In [28]:
# Function to count the number of words in a list and return as a dictionary
def count_words(a_list):
    """Return a dict mapping each distinct item of a_list to its number of occurrences.

    Keys appear in order of first occurrence; an empty input gives an empty dict.
    """
    tally = {}
    for word in a_list:
        # Start from 0 the first time a word is seen
        tally[word] = tally.get(word, 0) + 1
    return tally

In [64]:
# Inspect the per-post word lists
post_df['word_list']

0      [a, a, a, a, a, a, a, ad, addition, additional...
1      [a, a, a, a, accepted, access, ad, additional,...
2      [actual, advertised, amenity, and, and, and, a...
3      [a, a, adult, amenities, and, and, and, applic...
4      [additional, agent, an, and, any, apartment, a...
                             ...                        
96     [acceptedceiling, aircentral, along, amazing, ...
97     [additional, amazing, an, an, and, and, availa...
98     [a, a, a, a, a, a, a, a, access, additional, a...
99     [a, a, a, access, additional, also, also, and,...
100    [a, a, a, a, a, a, a, a, access, additional, a...
Name: word_list, Length: 101, dtype: object

In [65]:
# Drop every token that is not purely alphabetic from each post's word list
alpha_only = post_df['word_list'].map(keep_alpha)
post_df['word_list'] = alpha_only
post_df['word_list']

0      [a, a, a, a, a, a, a, ad, addition, additional...
1      [a, a, a, a, accepted, access, ad, additional,...
2      [actual, advertised, amenity, and, and, and, a...
3      [a, a, adult, amenities, and, and, and, applic...
4      [additional, agent, an, and, any, apartment, a...
                             ...                        
96     [acceptedceiling, aircentral, along, amazing, ...
97     [additional, amazing, an, an, and, and, availa...
98     [a, a, a, a, a, a, a, a, access, additional, a...
99     [a, a, a, access, additional, also, also, and,...
100    [a, a, a, a, a, a, a, a, access, additional, a...
Name: word_list, Length: 101, dtype: object

In [135]:
# Create a DataFrame for each word and number of occurrences per post.
# pw_raw_df keeps NaN wherever a word is absent from a post; pw_df is the same
# table with those gaps filled with 0. pw_df is derived from pw_raw_df so the
# posts are only counted once (fillna returns a new frame, pw_raw_df is untouched).
pw_raw_df = pd.DataFrame( post_df['word_list'].apply(count_words).tolist() )
pw_df = pw_raw_df.fillna(0)
pw_df

Unnamed: 0,a,ad,addition,additional,all,amazing,an,and,any,apartment,...,metra,nicest,starbucks,walgreens,bring,comfortable,knowing,relax,thoughtfully,vibes
0,7.0,1.0,1.0,1.0,3.0,1.0,3.0,15.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,1.0,0.0,2.0,2.0,0.0,3.0,17.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,8.0,0.0,0.0,1.0,2.0,0.0,1.0,12.0,2.0,5.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
99,3.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Basic statistics of the per-post counts for every word
# (pw_df holds 0 where a word is absent from a post, so absences are included)
pw_df.describe()

Unnamed: 0,a,ad,addition,additional,all,amazing,an,and,any,apartment,...,metra,nicest,starbucks,walgreens,bring,comfortable,knowing,relax,thoughtfully,vibes
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,...,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,1.594059,0.059406,0.029703,0.267327,0.534653,0.138614,0.49505,5.80198,0.267327,0.881188,...,0.019802,0.019802,0.019802,0.019802,0.019802,0.019802,0.019802,0.019802,0.019802,0.019802
std,2.267943,0.237562,0.170613,0.527088,0.794536,0.347267,0.923296,4.558552,0.733363,1.088918,...,0.140014,0.140014,0.140014,0.140014,0.140014,0.140014,0.140014,0.140014,0.140014,0.140014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,0.0,1.0,0.0,1.0,8.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,1.0,1.0,2.0,3.0,1.0,4.0,17.0,3.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [117]:
# One row per word (sorted alphabetically) with its summary statistics as columns
word_stats = pw_df.describe()
pwinfo_df = word_stats.T.sort_index()
pwinfo_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
a,101.0,1.594059,2.267943,0.0,0.0,1.0,2.0,9.0
aberdeen,101.0,0.059406,0.276470,0.0,0.0,0.0,0.0,2.0
about,101.0,0.138614,0.347267,0.0,0.0,0.0,0.0,1.0
above,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0
abundance,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
zelle,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0
zip,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0
zipcar,101.0,0.019802,0.140014,0.0,0.0,0.0,0.0,1.0
zureikat,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0


In [118]:
# Total occurrences of each word across all posts.
# DataFrame.sum() is the vectorized column total, and column assignment aligns
# on the word index, so no explicit sort or one-column DataFrame wrapper is needed.
pwinfo_df['word_count'] = pw_df.sum()
pwinfo_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,word_count
a,101.0,1.594059,2.267943,0.0,0.0,1.0,2.0,9.0,161.0
aberdeen,101.0,0.059406,0.276470,0.0,0.0,0.0,0.0,2.0,6.0
about,101.0,0.138614,0.347267,0.0,0.0,0.0,0.0,1.0,14.0
above,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0,1.0
abundance,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
zelle,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0,1.0
zip,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0,1.0
zipcar,101.0,0.019802,0.140014,0.0,0.0,0.0,0.0,1.0,2.0
zureikat,101.0,0.009901,0.099504,0.0,0.0,0.0,0.0,1.0,1.0


In [122]:
# Words that occur more than 150 times in total, most frequent first
frequent_mask = pwinfo_df['word_count'].gt(150)
pwinfo_df.loc[frequent_mask].sort_values(by='word_count', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,word_count
and,101.0,5.80198,4.558552,0.0,3.0,5.0,8.0,17.0,586.0
the,101.0,4.19802,4.298883,0.0,1.0,4.0,5.0,16.0,424.0
to,101.0,3.693069,3.724896,0.0,2.0,3.0,5.0,21.0,373.0
in,101.0,2.594059,2.055131,0.0,1.0,2.0,4.0,8.0,262.0
with,101.0,2.415842,2.654307,0.0,0.0,2.0,3.0,11.0,244.0
of,101.0,1.980198,2.044408,0.0,0.0,2.0,3.0,10.0,200.0
for,101.0,1.891089,1.355736,0.0,1.0,2.0,3.0,5.0,191.0
is,101.0,1.782178,2.886534,0.0,0.0,1.0,2.0,14.0,180.0
a,101.0,1.594059,2.267943,0.0,0.0,1.0,2.0,9.0,161.0


In [129]:
# Total number of occurrences of each word over all posts, shown as a
# one-column table sorted alphabetically by word
pw_df.apply(sum).to_frame(name="word_count").sort_index()

Unnamed: 0,word_count
a,161.0
aberdeen,6.0
about,14.0
above,1.0
abundance,1.0
...,...
zelle,1.0
zip,1.0
zipcar,2.0
zureikat,1.0


In [165]:
# Sum of word counts across all samples.
# sum() skips NaN on its own, so masking with notna() first was a no-op.
pw_raw_df.sum()

a               161.0
ad                6.0
addition          3.0
additional       27.0
all              54.0
                ...  
comfortable       2.0
knowing           2.0
relax             2.0
thoughtfully      2.0
vibes             2.0
Length: 1619, dtype: float64

In [170]:
# Number of samples in which the word is present.
# count() only tallies non-NaN cells, so masking with notna() first was a no-op.
pw_raw_df.count()

a               52
ad               6
addition         3
additional      23
all             38
                ..
comfortable      2
knowing          2
relax            2
thoughtfully     2
vibes            2
Length: 1619, dtype: int64

In [172]:
# Statistics for each word in cases where the word is present in a sample post.
# describe() ignores NaN (posts lacking the word), so masking with notna()
# first was a no-op.
pw_raw_df.describe().transpose().sort_index()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
a,52.0,3.096154,2.311604,1.0,1.0,2.0,4.0,9.0
aberdeen,5.0,1.200000,0.447214,1.0,1.0,1.0,1.0,2.0
about,14.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0
above,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
abundance,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
zelle,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
zip,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
zipcar,2.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0
zureikat,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0
