In [2]:
import csv
import re

with open("A1_dataset.csv", encoding="utf8", newline='') as file:
    data = csv.reader(file)
    next(data)
    data_list = list(data)

In [3]:
negative_sent = []
positive_sent = []

for line in data_list:
    if line[0][0] == "0":
        negative_sent.append(line)

    else:
        positive_sent.append(line)

Regex used for tokenization = ((https?|www).+\w+|[@]?\w+|(\.+)|[^\w\s])

The above regex considers it as a token if one of the following coniditon is matched
1. Starts with an http,www,https (handles urls)
2. Is a word, can start with '@' (counts usernames as single tokens)
3. Any character other than above specified

In [4]:
reg_exp = r"((https?|www).+\w+\.([\w])+[^\s]+)|[@]?\w+|(\.+)|[^\w\s]"

total_negative_tokens = 0
total_positive_tokens = 0

unique_negative_tokens = set()
unique_positive_tokens = set()

for tweet in negative_sent:
    tokens = re.finditer(reg_exp, tweet[2], re.MULTILINE)
    tokens_list = list(tokens)

    for match in tokens_list:
        unique_negative_tokens.add(match.group())

    total_negative_tokens += len(tokens_list)
	

for tweet in positive_sent:
    tokens = re.finditer(reg_exp, tweet[2], re.MULTILINE)
    tokens_list = list(tokens)

    for match in tokens_list:
        unique_positive_tokens.add(match.group())

    total_positive_tokens += len(tokens_list)

average_negative_tokens = total_negative_tokens/len(negative_sent)
average_positive_tokens = total_positive_tokens/len(positive_sent)


print(f"Average tokens in negative tweets = {average_negative_tokens} tokens/tweet")
print(f"Average tokens in positive tweets = {average_positive_tokens} tokens/tweet")


Average tokens in negative tweets = 16.588 tokens/tweet
Average tokens in positive tweets = 16.06077831219939 tokens/tweet


Regex for sentences = "[^.!?\s][^.!?\n]*(https?|www.+\w+|[@]?\w+|\.+|[^\w\s]*)"

In [5]:
reg_exp = r'[^.!?\s][^.!?\n]*(https?|www.+\w+|[@]?\w+|\.+|[^\w\s]*)'

total_negative_sentences = 0
total_positive_sentences = 0

positive_sentences = []
negative_sentences = []

for tweet in negative_sent:
    sentences = re.findall(reg_exp, tweet[2])
    total_negative_sentences += len(sentences)
    
    for sentence in sentences:
        negative_sentences.append(sentence)


for tweet in positive_sent:
    sentences = re.findall(reg_exp, tweet[2])
    total_positive_sentences += len(sentences)

    for sentence in sentences:
        positive_sentences.append(sentence)

average_negative_sentences = total_negative_sentences/len(negative_sent)
average_positive_sentences = total_positive_sentences/len(positive_sent)

print(f"Average sentences in negative tweets = {average_negative_sentences} sentences/tweet")
print(f"Average sentences in positive tweets = {average_positive_sentences} sentences/tweet")

Average sentences in negative tweets = 1.85 sentences/tweet
Average sentences in positive tweets = 1.9610843900306079 sentences/tweet


In [6]:
# reg = "r'\b(?<!-)[aeiouAEIOU][-\w']*(?!@)\b'"
words_v_n = 0 #No of words starting with a vowel in negative class
words_c_n = 0 #No of words starting with a consonant in negative class
words_v_p = 0 #No of words starting with a vowel in positive class
words_c_p = 0 #No of words starting with a consonant in positive class

for i in range(len(negative_sent)): #No of words starting with a vowel in negative class
    tokens = re.findall(r"\b(?<![-@])[aeiouAEIOU][-a-zA-Z']*(?!@)\b", negative_sent[i][2])
    words_v_n += len(tokens)

for i in range(len(positive_sent)): #No of words starting with a vowel in positive class
    tokens = re.findall(r"\b(?<![-@])[aeiouAEIOU][-a-zA-Z']*(?!@)\b", positive_sent[i][2])
    words_v_p += len(tokens)
    
for i in range(len(negative_sent)): #No of words starting with a consonant in negative class
    tokens = re.findall(r"\b(?<![-@.'])[^-\saeiouAEIOU@.//:'\\)\d][-a-zA-Z']*(?!.com)\b", negative_sent[i][2])
    words_c_n += len(tokens)
    # print(tokens)

for i in range(len(positive_sent)): #No of words starting with a consonant in positive class
    tokens = re.findall(r"\b(?<![-@.'])[^-\saeiouAEIOU@.:'\\//)d][-a-zA-Z']*(?!.com)\b", positive_sent[i][2])
    words_c_p += len(tokens)
    # print(tokens)

print("No of words starting with a vowel in negative class = " , words_v_n)
print("No of words starting with a vowel in positive class = ", words_v_p)

print("No of words starting with a consonant in negative class = ", words_c_n)
print("No of words starting with a consonant in positive class = ", words_c_p)



No of words starting with a vowel in negative class =  6867
No of words starting with a vowel in positive class =  7010
No of words starting with a consonant in negative class =  19039
No of words starting with a consonant in positive class =  20245


## Lower casing all the text

In [7]:
def lower_text(result):
    return result.group(0).lower()

negative_sent_lower = []
positive_sent_lower = []

for tweet in negative_sent:
    new_text = re.sub("[A-Z]", lower_text, tweet[2])
    negative_sent_lower.append(new_text)

for tweet in positive_sent:
    new_text = re.sub("[A-Z]", lower_text, tweet[2])
    positive_sent_lower.append(new_text)



reg_exp = r"((https?|www).+\w+\.([\w])+[^\s]+)|[@]?\w+|(\.+)|[^\w\s]"

unique_negative_tokens_after_lowercase = set()
unique_positive_tokens_after_lowercase = set()

for tweet in negative_sent_lower:
    tokens = re.finditer(reg_exp, tweet, re.MULTILINE)
    tokens_list = list(tokens)

    for match in tokens_list:
        unique_negative_tokens_after_lowercase.add(match.group())

for tweet in positive_sent_lower:
    tokens = re.finditer(reg_exp, tweet, re.MULTILINE)
    tokens_list = list(tokens)

    for match in tokens_list:
        unique_positive_tokens_after_lowercase.add(match.group())

print("Negatives")
print("Unique tokens before lowercase = ", len(unique_negative_tokens))
print("Unique tokens after lowercase = ", len(unique_negative_tokens_after_lowercase))

print("Positives")
print("Unique tokens before lowercase = ", len(unique_positive_tokens))
print("Unique tokens after lowercase = ", len(unique_positive_tokens_after_lowercase))


Negatives
Unique tokens before lowercase =  6448
Unique tokens after lowercase =  5655
Positives
Unique tokens before lowercase =  7900
Unique tokens after lowercase =  6875


## Counting and listing all usernames

In [8]:
reg_exp = "[@]\w+"

total_usernames_in_negative = 0
total_usernames_in_positive = 0

usernames_in_negative = []
usernames_in_positive = []

for tweet in negative_sent:
    usernames = re.findall(reg_exp, tweet[2])
    
    if len(usernames) != 0:

        total_usernames_in_negative += len(usernames)

        for user in usernames:
            usernames_in_negative.append(user)


for tweet in positive_sent:
    usernames = re.findall(reg_exp, tweet[2])
    
    if len(usernames) != 0:

        total_usernames_in_positive += len(usernames)

        for user in usernames:
            usernames_in_positive.append(user)

print("Total usernames in negative: ", total_usernames_in_negative)
#print(usernames_in_negative)

print("Total usernames in positive: ",total_usernames_in_positive)
#print(usernames_in_positive)

Total usernames in negative:  803
Total usernames in positive:  1305


## Counting and listing all the URLs

In [9]:
reg_exp = "((https?|www[\.][^\.\s])[^\s]+)"

total_url_in_negative = 0
total_url_in_positive = 0

urls_in_negative = []
urls_in_positive = []

for tweet in negative_sent:
    urls = re.findall(reg_exp, tweet[2])
    
    if len(urls) != 0:
        total_url_in_negative += len(urls)
        for i in urls:
            urls_in_negative.append(i[0])

for tweet in positive_sent:
    urls = re.findall(reg_exp, tweet[2])
    
    if len(urls) != 0:
        total_url_in_positive += len(urls)
        for i in urls:
            urls_in_positive.append(i[0])

print("Total URLs in negative: ", total_url_in_negative)
#print(urls_in_negative)

print("Total URLs in positive: ",total_url_in_positive)
#print(urls_in_positive)

Total URLs in negative:  60
Total URLs in positive:  136


## Number of tweets for each day of week

In [10]:

negative_sent_days = {'Mon':0, 'Tue':0, 'Wed':0, 'Thu':0, 'Fri':0, 'Sat':0, 'Sun':0}
positive_sent_days = {'Mon':0, 'Tue':0, 'Wed':0, 'Thu':0, 'Fri':0, 'Sat':0, 'Sun':0}

for tweet in negative_sent:

    all_matches = re.finditer("^Mon|Tue|Wed|Thu|Fri|Sat|Sun", tweet[1])
    for match in list(all_matches):
        negative_sent_days[match.group()] += 1

for tweet in positive_sent:

    all_matches = re.finditer("^Mon|Tue|Wed|Thu|Fri|Sat|Sun", tweet[1])
    for match in list(all_matches):
        positive_sent_days[match.group()] += 1

print("For negative tweets")
print(negative_sent_days)

print("For positive tweets")
print(positive_sent_days)

For negative tweets
{'Mon': 391, 'Tue': 154, 'Wed': 127, 'Thu': 171, 'Fri': 473, 'Sat': 119, 'Sun': 565}
For positive tweets
{'Mon': 481, 'Tue': 132, 'Wed': 172, 'Thu': 50, 'Fri': 391, 'Sat': 298, 'Sun': 763}


# Part B

In [17]:
#word = input("Enter a word: ")
#class_label = input("Enter class label: ")

reg_exp = r'[^.!?\s][^.!?\n]*(https?|www.+\w+|[@]?\w+|\.+|[^\w\s]*)'


negative_sentences = []

for tweet in negative_sent:
    sentences = re.finditer(reg_exp, tweet[2])
    #print(list(sentences))


[<re.Match object; span=(0, 33), match='About to get threaded and scared '>]
[<re.Match object; span=(0, 36), match='Needs to shake this gloomy feeling!!'>, <re.Match object; span=(40, 62), match="Maybe it's the rain???">]
[<re.Match object; span=(0, 18), match='Minecart ride now.'>, <re.Match object; span=(19, 73), match="Sarah's still too afraid to ride anything fun wit>]
[<re.Match object; span=(0, 27), match='@sokendrakouture yea alone '>]
[<re.Match object; span=(0, 37), match='@flyingbolt  Not as good without you!'>]
[<re.Match object; span=(0, 46), match="LOL HELP ME, I'm obsessed with Hannah Montana ">]
[<re.Match object; span=(0, 45), match='I hate when people have to move away from me '>]
[<re.Match object; span=(0, 11), match='http://bit.'>, <re.Match object; span=(11, 40), match='ly/AEbs3   I can only be sad.'>, <re.Match object; span=(42, 55), match='#iranelection'>]
[<re.Match object; span=(0, 112), match="@digitallearnin Heppell's opening sentences descr>]
[<re.Match obj