In [284]:
import numpy as np
import pandas as pd
import datetime
from os import path
import json

In [285]:
def get_lines(data_path):
    """Return the number of lines in the UTF-8 text file at ``data_path``.

    The file is streamed line by line, so it is never held in memory as a
    whole. An empty file yields 0.
    """
    with open(data_path, mode='r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)

In [286]:
def read_tweet_file(file_path):
    """Read an event file and return its lines as a list of strings.

    Each returned element keeps its trailing newline, exactly as yielded by
    iterating over the file object. Callers in this notebook treat element 0
    as the source tweet and the remaining elements as follow-up tweets, one
    JSON document per line.

    The file is decoded explicitly as UTF-8 (matching the other readers in
    this notebook) so the result does not depend on the platform's default
    locale encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return list(f)

In [287]:
def linguistic_features(tweet):
    """Return the tweet's ``'text'`` field, or ``""`` when it is unavailable.

    Parameters
    ----------
    tweet : object
        Normally the dict produced by ``json.loads`` for one tweet.

    Returns
    -------
    str
        The text of the tweet. An empty string is returned when ``tweet`` has
        no ``'text'`` entry, cannot be indexed by key at all, or stores a
        non-string value (for example JSON ``null``) under ``'text'``, so that
        callers can always concatenate the result with other strings.
    """
    try:
        text = tweet['text']
    except (KeyError, IndexError, TypeError):
        # Missing key, or `tweet` is not a mapping (None, list, str, number).
        return ""
    return text if isinstance(text, str) else ""

In [288]:
def user_features(tweet):
    """Extract seven account-level features from one tweet.

    This variant reads the counts from ``tweet['user']['public_metrics']``
    (``followers_count``, ``following_count``, ``listed_count``) and expects
    timestamps that start with ``YYYY-MM-DD`` (presumably the Twitter API v2
    layout - TODO confirm against the data files).

    Extraction is best-effort: any field that is missing, null, or of an
    unexpected type contributes its default instead of raising.

    Parameters
    ----------
    tweet : object
        Normally the dict produced by ``json.loads`` for one tweet.

    Returns
    -------
    list
        ``[influence, role, credibility, lst, prof_desc, prof_url, age]``:

        * influence   - follower count (default 0)
        * role        - ``influence / (following + 1e-10)``
        * credibility - 1 if ``user['verified']`` is truthy, else 0
        * lst         - listed count (default 0)
        * prof_desc   - 1 if ``user['description']`` is present and not null
        * prof_url    - 1 if ``user['entities']['url']`` is present and not null
        * age         - whole days between the account creation date and the
                        tweet creation date (default 0)
    """
    user = tweet.get('user') if isinstance(tweet, dict) else None
    if not isinstance(user, dict):
        user = {}
    metrics = user.get('public_metrics')
    if not isinstance(metrics, dict):
        metrics = {}

    def _count(name):
        # Null / non-numeric counts fall back to 0 so the features stay summable.
        value = metrics.get(name)
        return value if isinstance(value, (int, float)) else 0

    def _iso_day(stamp):
        # Only the leading 'YYYY-MM-DD' characters are read; the time is ignored.
        return datetime.date(int(stamp[:4]), int(stamp[5:7]), int(stamp[8:10]))

    influence = _count('followers_count')  # important
    friend = _count('following_count')

    try:
        # The 1e-10 keeps the ratio defined when the account follows nobody;
        # such accounts therefore get a very large `role` value.
        role = influence / (friend + 1e-10)
    except ArithmeticError:
        role = 0

    credibility = 1 if user.get('verified') else 0  # important
    lst = _count('listed_count')
    prof_desc = 1 if user.get('description') is not None else 0

    entities = user.get('entities')
    has_url = isinstance(entities, dict) and entities.get('url') is not None
    prof_url = 1 if has_url else 0  # important

    try:
        age = (_iso_day(tweet['created_at']) - _iso_day(user['created_at'])).days
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing timestamp, non-string value, or text that is not a valid date.
        age = 0

    return [influence, role, credibility, lst, prof_desc, prof_url, age]

In [301]:
def test_user_features(tweet):

    try:
        influence = tweet['user']['followers_count'] # important
    except:
        influence = 0
        
    try:
        friend = tweet['user']['friends_count']
    except:
        friend = 0
  
    try:
        role = influence/(friend + 1e-10)
    except:
        role = 0

    try:
        credibility = 1 if tweet['user']['verified'] else 0 # important
    except: 
        credibility = 0
  
    try:
        lst = tweet['user']['listed_count']
    except:
        lst = 0
    
    try:
        prof_desc = 1 if tweet['user'].get('description', None) is not None else 0
    except:
        prof_desc = 0
   
    try:
        prof_url = 1 if tweet['user'].get('url', None) is not None else 0 # important
    except:
        prof_url = 0
    
    try:
        create = tweet['created_at']
        account = tweet['user']['created_at']
        c = datetime.datetime.strftime(datetime.datetime.strptime(create,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d')
        a = datetime.datetime.strftime(datetime.datetime.strptime(account,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d')
        age = (datetime.datetime(int(c[:4]), int(c[5:7]), int(c[8:10])) - datetime.datetime(int(a[:4]), int(a[5:7]), int(a[8:10]))).days
    except:
        age = 0
    
    return [influence, role, credibility, lst, prof_desc, prof_url, age]

In [290]:
def feature_extract(data_path, folder_path, save_path):
    """Build one feature row per event and write the table to ``save_path``.

    For every index ``i`` in ``range(get_lines(data_path))`` the file
    ``{folder_path}/{i}.json`` is read if it exists (missing files are
    skipped). Its first line is the source tweet and every further line is a
    follow-up tweet, one JSON document per line.

    Each row contains:

    * the seven user features of the source tweet plus the element-wise mean
      of the follow-up tweets' user features (the mean is 0 when the event has
      no follow-ups);
    * ``text``: the source text, a space, then every follow-up text each
      preceded by a space.

    The resulting DataFrame (indexed by ``row`` = ``i``) is printed and saved
    as CSV.

    Parameters
    ----------
    data_path : str
        Text file whose line count gives the number of event indices to scan.
    folder_path : str
        Directory holding the ``{i}.json`` event files. When this string ends
        with ``'test'`` the tweets are parsed with ``test_user_features``,
        otherwise with ``user_features``.
    save_path : str
        Destination of the CSV file.
    """
    # The schema depends only on the folder, so pick the extractor once.
    extract_user = test_user_features if folder_path.endswith('test') else user_features

    data = []
    for i in range(get_lines(data_path)):

        file_path = f'{folder_path}/{i}.json'
        if not path.exists(file_path):
            continue

        # Drop blank lines so a stray empty line cannot break json.loads.
        event = [json.loads(line) for line in read_tweet_file(file_path) if line.strip()]
        if not event:
            # Nothing to describe in an empty file; treat it like a missing one.
            continue
        source, replies = event[0], event[1:]

        # user features: source tweet + mean over the follow-up tweets
        source_user = extract_user(source)
        reply_users = [extract_user(tweet) for tweet in replies]
        if reply_users:
            reply_mean = [sum(column) / len(reply_users) for column in zip(*reply_users)]
        else:
            # No follow-ups: there is nothing to average, so contribute zero.
            reply_mean = [0.0] * len(source_user)
        event_user = [x + y for x, y in zip(source_user, reply_mean)]

        # linguistic features: every follow-up text is prefixed by one space
        reply_text = "".join(" " + linguistic_features(tweet) for tweet in replies)
        event_text = linguistic_features(source) + " " + reply_text

        data.append([i] + event_user + [event_text])

    columns = ['row', 'influence', 'role', 'credibility', 'list',
               'profile_description', 'profile_url', 'age', 'text']
    df = pd.DataFrame(data, columns=columns).set_index('row')
    print(df)

    df.to_csv(save_path)

In [291]:
def get_label(label_path):
    """Read the label file at ``label_path`` and return its labels as integers.

    Every line is stripped of surrounding whitespace and passed to
    ``convert_label``; the results are returned in file order, so element
    ``k`` of the list corresponds to line ``k`` of the file. Any exception
    raised by ``convert_label`` for an unrecognised line propagates.
    """
    with open(label_path, mode='r', encoding='utf-8') as handle:
        return [convert_label(raw.strip()) for raw in handle]

def convert_label(label):
    """Map a textual class label to its integer code.

    Parameters
    ----------
    label : str
        ``"rumour"`` or ``"nonrumour"`` (exact match, no surrounding
        whitespace).

    Returns
    -------
    int
        1 for ``"rumour"``, 0 for ``"nonrumour"``.

    Raises
    ------
    ValueError
        If ``label`` is anything else. ``ValueError`` is a subclass of
        ``Exception``, so code that catches ``Exception`` keeps working.
    """
    if label == "rumour":
        return 1
    if label == "nonrumour":
        return 0
    raise ValueError("label classes must be 'rumour' or 'nonrumour'")

In [292]:
def append_label(label_path, save_path, save_path_label):
    """Add a ``label`` column to a saved feature table and write it back out.

    The CSV at ``save_path`` is loaded, and for every record the value of its
    ``row`` column is used as a position into the list returned by
    ``get_label(label_path)``. The table with the extra ``label`` column is
    written to ``save_path_label``.

    Parameters
    ----------
    label_path : str
        Label file, one label per line; line ``k`` labels event ``k``.
    save_path : str
        Feature CSV containing a ``row`` column.
    save_path_label : str
        Destination of the labelled CSV.

    Raises
    ------
    IndexError
        If a ``row`` value has no corresponding line in the label file.
    """
    data = pd.read_csv(save_path)
    target = get_label(label_path)

    # `row` is the original event index, which can have gaps because missing
    # event files are skipped, so labels are looked up by value, not position.
    data['label'] = [target[r] for r in data['row']]

    # NOTE(review): read_csv is called without index_col, so to_csv also writes
    # the default integer index as an unnamed first column. Kept as-is because
    # downstream readers may rely on that layout - confirm before changing.
    data.to_csv(save_path_label)

In [293]:
# train: paths for the training split, read as module-level names by the
# feature_extract / append_label cells that follow.
    
# data_path: text file whose line count sets how many event indices are scanned
data_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/train/train.data.txt'
# folder_path: directory containing the per-event '{i}.json' files
folder_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/train/event-objects-train-updated'
# save_path: feature CSV written by feature_extract and re-read by append_label
save_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/train/train.csv'
# save_path_label: feature CSV with the extra 'label' column, written by append_label
save_path_label = '/Users/weimin/Downloads/NLP/project-data/data_obj/train/train_label.csv'
# label_path: one 'rumour' / 'nonrumour' label per line
label_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/train/train.label.txt'

In [294]:
# Build the feature table from the currently configured paths, print it, and save it to save_path.
feature_extract(data_path, folder_path, save_path)

         influence          role  credibility           list  \
row                                                            
0     8.160000e+02      0.346203     0.000000       2.000000   
1     5.520169e+06   6251.467708     1.709677   35395.193548   
2     1.222000e+03      1.287671     0.000000       2.000000   
3     3.553883e+07  33338.021083     1.800000  225507.200000   
4     4.918000e+03      0.959984     0.000000       7.000000   
...            ...           ...          ...            ...   
1889  1.262000e+03      1.974961     0.000000       2.000000   
1890  2.256051e+05      3.862789     1.571429    2000.285714   
1891  2.778364e+07  56933.684746     1.593750  104322.093750   
1892  1.612269e+07   8672.777479     1.777778   89795.555556   
1894  3.027000e+03      2.002163     0.000000      40.000000   

      profile_description  profile_url          age  \
row                                                   
0                2.000000     0.000000  6470.000000   
1 

In [295]:
# Re-read the CSV at save_path, attach the labels from label_path, and write save_path_label.
append_label(label_path, save_path, save_path_label)

In [296]:
# dev: paths for the development split. These rebind the same module-level
# names used for the previous split, so the cells below must run after this one.
    
# data_path: text file whose line count sets how many event indices are scanned
data_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/dev/dev.data.txt'
# folder_path: directory containing the per-event '{i}.json' files
folder_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/dev/event-objects-dev-updated'
# save_path: feature CSV written by feature_extract and re-read by append_label
save_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/dev/dev.csv'
# save_path_label: feature CSV with the extra 'label' column, written by append_label
save_path_label = '/Users/weimin/Downloads/NLP/project-data/data_obj/dev/dev_label.csv'
# label_path: one 'rumour' / 'nonrumour' label per line
label_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/dev/dev.label.txt'

In [297]:
# Build the feature table from the currently configured paths, print it, and save it to save_path.
feature_extract(data_path, folder_path, save_path)

        influence          role  credibility          list  \
row                                                          
0    7.004500e+04  1.986529e+01     0.000000   1970.000000   
1    6.140000e+02  1.370000e+12     0.000000      4.500000   
2    6.010000e+03  2.871476e+00     0.000000     60.000000   
3    1.878000e+03  7.347418e-01     0.000000     56.000000   
4    2.123004e+06  2.757159e+03     1.564103  14812.358974   
..            ...           ...          ...           ...   
627  2.240000e+02  2.589595e-01     0.000000      2.000000   
628  2.957238e+06  3.248998e+02     1.500000  16152.000000   
629  3.140000e+02  3.512304e-01     0.000000      0.000000   
630  1.308585e+05  6.539265e+03     0.000000   1061.000000   
631  9.082800e+03  4.467683e+00     0.000000     61.200000   

     profile_description  profile_url          age  \
row                                                  
0               2.000000     2.000000  8004.000000   
1               2.000000     1.

In [298]:
# Re-read the CSV at save_path, attach the labels from label_path, and write save_path_label.
append_label(label_path, save_path, save_path_label)

In [302]:
# test: paths for the test split. folder_path ends with 'test', which is what
# makes feature_extract parse these tweets with test_user_features.
# No label_path is assigned in this cell, so that name keeps whatever value an
# earlier cell gave it; no append_label call is visible for this split.
    
# data_path: text file whose line count sets how many event indices are scanned
data_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/test/test.data.txt'
# folder_path: directory containing the per-event '{i}.json' files
folder_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/test/event-objects-test'
# save_path: feature CSV written by feature_extract
save_path = '/Users/weimin/Downloads/NLP/project-data/data_obj/test/test.csv'
# save_path_label: assigned here but not used by any cell visible below
save_path_label = '/Users/weimin/Downloads/NLP/project-data/data_obj/test/test_label.csv'

In [303]:
# Build the feature table from the currently configured paths, print it, and save it to save_path.
feature_extract(data_path, folder_path, save_path)

        influence         role  credibility          list  \
row                                                         
0    2.351750e+05    33.989707     1.000000   2107.000000   
1    1.062000e+03     1.088115     0.000000     20.000000   
2    2.060200e+04   403.960784     0.000000     94.000000   
3    2.780000e+02     3.159091     0.000000      2.000000   
4    5.113333e+02     0.837289     0.000000      5.333333   
..            ...          ...          ...           ...   
553  1.527027e+07  9188.529639     1.000000  95290.000000   
554  6.249551e+05   307.448962     1.016129   1482.903226   
555  2.402017e+05   198.949155     1.000000   2425.000000   
556  1.384616e+07  7971.413332     1.500000  50188.500000   
557  2.976000e+03     2.946535     0.000000      4.000000   

     profile_description  profile_url          age  \
row                                                  
0                    2.0     1.000000  4487.000000   
1                    2.0     0.000000  6595.