The original blog post for this notebook is at: https://umar-yusuf.blogspot.com.ng/2016/12/nairaland-christmas-birthday-analyzed.html

In [1]:
# import the libraries we are going to use

# libraries for Scraping and Cleaning the data
import re
import requests
from bs4 import BeautifulSoup


# libraries for Analyzing and Visualizing the data
import pandas as pd
from datetime import datetime

In [2]:
# Download the raw HTML of the Nairaland home page
url = "http://www.nairaland.com/home"

# requests waits indefinitely by default; a timeout keeps this cell from
# hanging forever if the site never answers.
raw_html = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
raw_html.raise_for_status()

raw_data = raw_html.text  # the decoded response body as a string

# Parse the HTML with the lxml parser so the page can be searched by tag
soup_data = BeautifulSoup(raw_data, "lxml")

In [3]:
# Show only the part of the page we need: the table cells (<td> tags).
# Calling the soup object directly is shorthand for find_all().

soup_data.find_all("td")

[<td class="grad"><h1><a class="g" href="http://www.nairaland.com/?" title="Nairaland Nigerian Forum">₦airaland Forum</a></h1> Welcome, <b>Guest</b>: <a href="/register">Join Nairaland</a> / <b><a href="/login">LOGIN!</a></b> / <a href="/trending">Trending</a> / <a href="/recent">Recent</a> / <a href="/topics">New</a><br/><b>Stats: </b>1,714,341 members, 3,274,303 topics. <b>Date</b>: Sunday, 25 December 2016 at 08:41 PM<p></p><form action="/search"> <input name="q" size="32" type="text"/>
 <input name="search" type="submit" value="Search"/></form> </td>,
 <td class="l w"><a href="/nairaland" title=" class=g"><b>Nairaland / General</b></a>: <a href="/politics" title="Our country Nigeria is the giant of Africa!"><b>Politics</b></a>, <a href="/crime" title=""><b>Crime</b></a>, <a href="/romance" title="Discuss dating, courtship, and romance in marriage."><b>Romance</b></a>, <a href="/jobs" title="Job/Employment Opportunities; Vacancies In Nigeria!"><b>Jobs/Vacancies</b></a>, <a href="/ca

In [4]:
# Print just the text of every table cell, leaving the HTML tags out
for cell in soup_data.find_all("td"):
    print (cell.text)

₦airaland Forum Welcome, Guest: Join Nairaland / LOGIN! / Trending / Recent / NewStats: 1,714,341 members, 3,274,303 topics. Date: Sunday, 25 December 2016 at 08:41 PM 
 
Nairaland / General: Politics, Crime, Romance, Jobs/Vacancies, Career, Business, Investment, NYSC, Education, Autos, Car Talk, Properties, Health, Travel, Family, Culture, Religion, Food, Diaries, Nairaland Ads, Pets, Agriculture
Entertainment: Jokes Etc, TV/Movies, Music/Radio, Celebrities, Fashion, Events, Sports, Gaming, Forum Games, Literature
Science/Technology: Programming, Webmasters, Computers, Phones, Art, Graphics & Video, Technology Market
 Featured Links / Twitter  / Facebook  / How To Advertise 
 » Civilian JTF Reveal The 'Real' Flag Of Shekau Rumoured To Have Been Captured (Pics) «» Tiwa Savage Shares Cute Photo Of Her Son, Jamil In His G-Wagon «» President Buhari Receives Christmas Gift From CAN (Photos) «» See The Sad Incident That Happened Today At Cameroon-Nigeria Border (Graphic Pics) «» 2016: 1.7 M

In [5]:
# The page holds far more text than we need, so use the 're' module to pull
# out only the entries that look like "username(age)".

# Note: members whose ages are not displayed are ignored, so that we don't
# have to deal with NaN values in our data


member_found = None

# Raw string on purpose: in a normal string literal "\w", "\(" and "\d" are
# invalid escape sequences and raise a warning on newer Python versions.
# Pattern: one or more word characters, '(', one or more digits, ')'
re_match = r"[\w]+\([\d]+\)"

for data in soup_data("td"):
    data_found = re.findall(re_match, data.text)

    # Each <td> with at least one match overwrites the previous result, so
    # the matches of the LAST matching cell are the ones kept.
    if data_found:
        member_found = data_found

print (member_found)

['PoDeep(26)', 'topeal(38)', 'k9ine(36)', 'Chriz(32)', 'odunbabe(38)', 'ptaller(38)', 'bomboyi(43)', 'fem88(33)', 'Hydy(26)', 'chubysoft1(31)', 'emmydex(30)', 'emzygab(33)', 'emmboy(25)', 'oluwaseyiwhyte(27)', 'beordune(33)', 'Cvesta(25)', 'lukma227(34)', 'Ifeanyie12(31)', 'Henesi2010(38)', 'gbadaniyi(35)', 'emmajew(44)', 'ghetto101(28)', 'gbudujames(37)', 'Bashirfuntua(29)', 'christygenius(31)', 'dontinero(36)', 'Ezyoung(23)', 'Jraph(26)', 'savcy(28)', 'Blofeld(36)', 'madridsta007(27)', 'tpalan(30)', 'Uniquexty(26)', 'abelee(32)', 'solidofor(27)', 'NuellaJ(34)', 'onysolz(28)', 'spenx084(31)', 'Lucasbalo(37)', 'dandig(23)', 'abbeyie007(34)', 'zees(23)', 'jaybwoi(35)', 'tyson99(36)', 'Ike77503(38)', 'Manxcopido(27)', 'missjo(32)', 'abinsco25(36)', 'Akinoau(26)', 'badmuspeace(31)', 'abbeyoye2001(38)', 'uzowulu8(23)', 'Ping411(28)', 'McAustin92(24)', 'Okeikpu(34)', 'djpop100(21)', 'bboy4sure(29)', 'chrisantus25(24)', 'sunhosting(30)', 'Omowalksola(20)', 'jemmanuella97(26)', 'abelincon(32)

In [6]:
# Start separating usernames from ages: drop every ")" from each entry in
# member_found, turning "username(age)" into "username(age".


member_found_replaced = list(
    map(lambda entry: entry.replace(")", ""), member_found)
)

print (member_found_replaced)

['PoDeep(26', 'topeal(38', 'k9ine(36', 'Chriz(32', 'odunbabe(38', 'ptaller(38', 'bomboyi(43', 'fem88(33', 'Hydy(26', 'chubysoft1(31', 'emmydex(30', 'emzygab(33', 'emmboy(25', 'oluwaseyiwhyte(27', 'beordune(33', 'Cvesta(25', 'lukma227(34', 'Ifeanyie12(31', 'Henesi2010(38', 'gbadaniyi(35', 'emmajew(44', 'ghetto101(28', 'gbudujames(37', 'Bashirfuntua(29', 'christygenius(31', 'dontinero(36', 'Ezyoung(23', 'Jraph(26', 'savcy(28', 'Blofeld(36', 'madridsta007(27', 'tpalan(30', 'Uniquexty(26', 'abelee(32', 'solidofor(27', 'NuellaJ(34', 'onysolz(28', 'spenx084(31', 'Lucasbalo(37', 'dandig(23', 'abbeyie007(34', 'zees(23', 'jaybwoi(35', 'tyson99(36', 'Ike77503(38', 'Manxcopido(27', 'missjo(32', 'abinsco25(36', 'Akinoau(26', 'badmuspeace(31', 'abbeyoye2001(38', 'uzowulu8(23', 'Ping411(28', 'McAustin92(24', 'Okeikpu(34', 'djpop100(21', 'bboy4sure(29', 'chrisantus25(24', 'sunhosting(30', 'Omowalksola(20', 'jemmanuella97(26', 'abelincon(32', 'Bensmart04(26', 'jakiedudu(33', 'joelenesleek(26', 'babino

In [7]:
# Split every "username(age" entry at the "(" and print the resulting
# two-element list, one entry per line.

for member_cleaned in (entry.split("(") for entry in member_found_replaced):
    print (member_cleaned)

# At this point "member_cleaned" is just the last [username, age] list.
# Next, all of these pairs are combined into a single dictionary.

['PoDeep', '26']
['topeal', '38']
['k9ine', '36']
['Chriz', '32']
['odunbabe', '38']
['ptaller', '38']
['bomboyi', '43']
['fem88', '33']
['Hydy', '26']
['chubysoft1', '31']
['emmydex', '30']
['emzygab', '33']
['emmboy', '25']
['oluwaseyiwhyte', '27']
['beordune', '33']
['Cvesta', '25']
['lukma227', '34']
['Ifeanyie12', '31']
['Henesi2010', '38']
['gbadaniyi', '35']
['emmajew', '44']
['ghetto101', '28']
['gbudujames', '37']
['Bashirfuntua', '29']
['christygenius', '31']
['dontinero', '36']
['Ezyoung', '23']
['Jraph', '26']
['savcy', '28']
['Blofeld', '36']
['madridsta007', '27']
['tpalan', '30']
['Uniquexty', '26']
['abelee', '32']
['solidofor', '27']
['NuellaJ', '34']
['onysolz', '28']
['spenx084', '31']
['Lucasbalo', '37']
['dandig', '23']
['abbeyie007', '34']
['zees', '23']
['jaybwoi', '35']
['tyson99', '36']
['Ike77503', '38']
['Manxcopido', '27']
['missjo', '32']
['abinsco25', '36']
['Akinoau', '26']
['badmuspeace', '31']
['abbeyoye2001', '38']
['uzowulu8', '23']
['Ping411', '28']


In [8]:
# Build "member_cleaned" as a dictionary that maps each username to its age
# (converted to int), splitting every entry at the "(" separator.

member_cleaned = {
    parts[0]: int(parts[1])
    for parts in (entry.split("(") for entry in member_found_replaced)
}

print (member_cleaned)

{'47xxx': 21, 'WalleyDon': 38, 'ceecane2': 31, 'muusa81': 35, 'zeentah': 26, 'joelenesleek': 26, 'dontinero': 36, 'BARRYVN2016': 28, 'abelincon': 32, 'Ezyoung': 23, 'athaboi': 35, 'elclark': 27, 'Ifeanyie12': 31, 'iKON11': 26, 'uzowulu8': 23, 'odapey': 20, 'Orusuangama666': 27, 'brownemmanuel43': 36, 'PoDeep': 26, 'Prowess95': 21, 'bvgfhrht': 33, 'chrisantus25': 24, 'Abbey1983': 33, 'k9ine': 36, 'odunbabe': 38, 'lifah': 24, 'aoyih': 29, 'madridsta007': 27, 'missjo': 32, 'AhmadGaladanchi': 24, 'Lucasbalo': 37, 'OmaHomes': 26, 'abeybest': 37, 'irevwoyeri': 22, 'Henesi2010': 38, 'kelvincoke': 26, 'brendan25': 26, 'chummyluv1': 26, 'gbadaniyi': 35, 'emmydollars4life': 34, 'emzyphynext': 17, 'badmuspeace': 31, 'Jesusbaby1': 26, 'praz001': 29, 'Hackportal': 21, 'Godpikin3': 22, 'godswilliz': 20, 'reallest': 23, 'Uniquexty': 26, 'Chriz': 32, 'odunola25': 26, 'SherylMiracle': 19, 'Topman0001': 33, 'McAustin92': 24, 'optimist25': 24, 'Omowalksola': 20, 'kickman': 57, 'jemmanuella97': 26, 'Littl

In [9]:
# Convert the "member_cleaned" dictionary into a pandas DataFrame.
# Note: in Python 3, dict.items() is a view, so it is turned into a list of
# (username, age) tuples before being handed to pandas.


# column names for the two values of each tuple
columns_name = ["Username", "Age"]

records = list(member_cleaned.items())
df = pd.DataFrame(records, columns=columns_name)

df

Unnamed: 0,Username,Age
0,47xxx,21
1,WalleyDon,38
2,ceecane2,31
3,muusa81,35
4,zeentah,26
5,joelenesleek,26
6,dontinero,36
7,BARRYVN2016,28
8,abelincon,32
9,Ezyoung,23


In [10]:
# Add a "Date" column holding today's date (taken from the datetime module).
# The same date value is assigned to every row.


todays_date = datetime.today().date()

df["Date"] = todays_date

df

Unnamed: 0,Username,Age,Date
0,47xxx,21,2016-12-25
1,WalleyDon,38,2016-12-25
2,ceecane2,31,2016-12-25
3,muusa81,35,2016-12-25
4,zeentah,26,2016-12-25
5,joelenesleek,26,2016-12-25
6,dontinero,36,2016-12-25
7,BARRYVN2016,28,2016-12-25
8,abelincon,32,2016-12-25
9,Ezyoung,23,2016-12-25


In [11]:
# Save the DataFrame to a CSV file named after the current date in YYYYMMDD
# form, e.g. 14/08/2016 becomes "20160814.csv".

csv_name = format(todays_date, "%Y%m%d")

df.to_csv("{}.csv".format(csv_name))

# The Christmas birthday dataset is now stored in a CSV file for future use...

<img src="https://2.bp.blogspot.com/-uRvlKbI3luE/WGAd0im2A7I/AAAAAAAABbI/oXuHCVyH5r46K-KWCRHZUj7EL0SVtGUwQCLcB/s1600/images.jpg"  width='500px' />