This data set contains the top 1,000 questions users posted to AskReddit in 2015.
There are five columns:

Columns | Description
---|---
Title | The title of the post
Score | The number of upvotes the post received
Time | When the post was posted
Gold | How much Reddit Gold users gave the post
NumComs | The number of comments the post received

# About regular expressions

In [7]:
# Read the data set and display the first 4 rows.

import csv

# Open the file in a context manager so the handle is closed as soon as the
# rows have been read. newline='' is what the csv module recommends for files
# handed to csv.reader (it keeps newlines embedded in quoted fields intact).
with open('askreddit_2015.csv', 'r', encoding='utf8', newline='') as csv_file:
    postswithheader = list(csv.reader(csv_file))
posts = postswithheader[1:]    # everything after the first (header) row
print(posts[:4])

[['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?', '11510', '1433213314', '1', '26195'], ["What's your favorite video that is 10 seconds or less?", '8656', '1434205517', '4', '8479'], ['What are some interesting tests you can take to find out about yourself?', '8480', '1443409636', '1', '4055'], ["PhD's of Reddit. What is a dumbed down summary of your thesis?", '7927', '1440188623', '0', '13201']]


In [11]:
# Some questions are aimed at a certain group of people and contain
# expressions like 'doctors of Reddit' or "PhD's of Reddit".
# Count how many question titles contain 'of reddit' or 'of Reddit'.

import re
# re.search() returns a match object (always truthy) or None;
# the [Rr] character class accepts either capitalisation.
of_reddit_count = sum(
    1 for question in posts if re.search('of [Rr]eddit', question[0])
)
print("Number of questions containing 'of reddit':", of_reddit_count)

Number of questions containing 'of reddit': 102


In [20]:
# Some questions carry a [Serious] tag; count how many titles have it.
# Written unescaped, [Serious] is a character class that matches any single
# one of the letters 'S', 'e', 'r', ..., so the brackets and parentheses are
# escaped with '\'.
# Only tags at the very beginning (^) or very end ($) of a title are counted.

# A raw string keeps the backslashes literal for the regex engine; in a normal
# string '\[' and '\(' are invalid escape sequences and trigger a warning.
# The pattern is compiled once, outside the loop.
serious_tag = re.compile(r'^[\[\(][Ss]erious[\]\)]|[\[\(][Ss]erious[\]\)]$')

serious_count = 0
for row in posts:
    if serious_tag.search(row[0]) is not None:  # uses \, ^, $, and |
        serious_count += 1
print('Number of questions start/end with [Serious], [serious],(Serious), or (serious):', serious_count)

Number of questions start/end with [Serious], [serious],(Serious), or (serious): 80


In [21]:
# Replace "[serious]", "(Serious)", and "(serious)" with "[Serious]" for all of the titles in posts,
# using re.sub().

for row in posts:
    # re.sub() returns a NEW string (str is immutable), so the result has to be
    # stored back into the row; otherwise the titles are left unchanged.
    # The 2nd argument is a plain replacement string, so it needs no escaping.
    row[0] = re.sub(r'[\[\(][Ss]erious[\]\)]', '[Serious]', row[0])

In [25]:
# Pull the four-digit year out of each of the strings below.

dates = ["Jan 17, 2012", "9/22/2005", "Spring 2007", "New Year's Eve 1999"]
# [1-2] matches the leading digit and [0-9]{3} the three digits after it.
# re.findall(regex, string) returns a list of every match, so `years`
# ends up as a list of lists.
years = [re.findall('[1-2][0-9]{3}', date_text) for date_text in dates]
print(years)

[['2012'], ['2005'], ['2007'], ['1999']]


# About time and date

In [30]:
import datetime                                  # better suited than `time` for date arithmetic
import time

current_time = time.time()                       # Unix timestamp
current_struct_time = time.gmtime(current_time)  # struct_time instance expressed in UTC
current_hour = current_struct_time.tm_hour       # tm_hour is an attribute: the current hour in UTC
current_year1 = current_struct_time.tm_year
print(current_year1)

# The datetime module contains a datetime class: module.class.method().
# now(datetime.timezone.utc) is used instead of utcnow(), which is deprecated
# since Python 3.12 and returns a naive object.
current_datetime = datetime.datetime.now(datetime.timezone.utc)
current_year2 = current_datetime.year
print(current_year2)                             # same result as time.gmtime(time.time()).tm_year

# Arithmetic on dates: the datetime module has a timedelta class whose
# instances represent a span of time. Add it to or subtract it from
# instances of the datetime class.
today = datetime.datetime.now()                  # first, make an instance of datetime
diff = datetime.timedelta(weeks=3, days=2)       # a span of 3 weeks and 2 days
future = today + diff                            # the date 3 weeks and 2 days from now

# Printing a datetime object with a customised string format.
march3 = datetime.datetime(year=2010, month=3, day=3)   # keyword arguments are ints, not strings
# strftime (format): datetime -> string
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
pretty_march3 = march3.strftime('%b %d, %Y')
# strptime (parse): string -> datetime. Calling strftime here would raise a
# TypeError because its first argument must be a datetime, not a str.
march3 = datetime.datetime.strptime('Mar 03,2010', '%b %d,%Y')


2019
2019


In [31]:
# Turn the Unix timestamps stored in the time column (index 2) into
# datetime objects, which print as human-readable strings.
for record in posts:
    unix_seconds = float(record[2])              # fromtimestamp() expects a number
    # Without a tz argument, fromtimestamp() returns a naive datetime in local time.
    record[2] = datetime.datetime.fromtimestamp(unix_seconds)
print(posts[0][2])

2015-06-01 22:48:34


In [35]:
# How many of the top 1000 questions were submitted in a given month?
def postmonth(month):
    """Return how many rows of the module-level ``posts`` fall in *month*.

    Each row's third column (index 2) must expose a ``.month`` attribute,
    which is compared against *month* for equality.
    """
    return sum(1 for entry in posts if entry[2].month == month)
print(postmonth(3))

59
