In [37]:
import pandas as pd
import numpy as np

# Helper functions
def distance(tweet1, tweet2):
    """Return the Euclidean distance between two numeric sequences.

    Coordinates are paired by position. The number of positions compared
    is ``len(tweet1)``, so ``tweet2`` must have at least that many entries.
    """
    squared_gaps = (
        (tweet1[pos] - tweet2[pos]) ** 2 for pos in range(len(tweet1))
    )
    return sum(squared_gaps) ** 0.5


def min_max_normalize(lst):
    """Linearly rescale values so the smallest maps to 0 and the largest to 1.

    Args:
        lst: A sized iterable of numbers.

    Returns:
        A new list holding one normalized value per input element, in the
        original order. An empty input yields an empty list. When every
        value is identical the range is zero, so each element maps to 0.0
        instead of triggering a division by zero.
    """
    # len() rather than truthiness so array-like inputs work too.
    if len(lst) == 0:
        return []

    minimum = min(lst)
    span = max(lst) - minimum
    if span == 0:
        # Constant input: there is no spread to rescale.
        return [0.0 for _ in lst]

    return [(value - minimum) / span for value in lst]


def scale(lst):
    """Standardize values by subtracting the mean and dividing by the std.

    Args:
        lst: An iterable of numbers that provides ``mean()`` and ``std()``
            methods (this notebook passes DataFrame columns). Whatever
            definition of standard deviation ``lst.std()`` uses carries
            through to the result.

    Returns:
        A new list with ``(value - mean) / std`` for each element, in the
        original order. When the standard deviation is zero (all values
        equal) every element maps to 0.0 instead of dividing by zero.
    """
    mean = lst.mean()
    std = lst.std()

    if std == 0:
        # Constant column: every value equals the mean, so the centered
        # value is 0 and there is no spread to divide by.
        return [0.0 for _ in lst]

    # Build new values rather than updating the loop variable in place.
    return [(value - mean) / std for value in lst]


def classify(unknown, dataset, labels, k):
    """Predict a 0/1 label for ``unknown`` by majority vote of its k nearest points.

    Every entry of ``dataset`` is compared with ``unknown`` via ``distance``.
    The ``k`` closest entries then vote with their value in ``labels``
    (looked up by the entry's position in ``dataset``). Labels other than
    0 and 1 are ignored.

    Returns:
        1 when strictly more neighbors are labelled 1 than 0, otherwise 0
        (so ties resolve to 0).
    """
    # Keep each point's position next to its distance so the label can
    # still be looked up after sorting.
    ranked = sorted(
        [distance(dataset[position], unknown), position]
        for position in range(len(dataset))
    )

    good_votes = 0
    bad_votes = 0
    for _, position in ranked[:k]:
        label = labels[position]
        if label == 0:
            bad_votes += 1
        elif label == 1:
            good_votes += 1

    return 1 if good_votes > bad_votes else 0
    

# Load the tweets (the file holds one JSON record per line)
all_tweets = pd.read_json("data/random_tweets.json", lines=True)


# Label: a tweet counts as viral (1) when its retweet count is at or above
# the median retweet count, otherwise 0
median_retweets = all_tweets['retweet_count'].median()
all_tweets['is_viral'] = np.where(all_tweets['retweet_count'] >= median_retweets, 1, 0)

# Features: text length plus two counts pulled from the nested user record
all_tweets['tweet_length'] = [len(text) for text in all_tweets['text']]
all_tweets['followers_count'] = [user['followers_count'] for user in all_tweets['user']]
all_tweets['friends_count'] = [user['friends_count'] for user in all_tweets['user']]


# Standardize every feature column with scale()
feature_columns = ['tweet_length', 'followers_count', 'friends_count']
for column in feature_columns:
    all_tweets[column] = scale(all_tweets[column])


# Training points as plain lists, with labels kept in row order
train_data = all_tweets[feature_columns].values.tolist()
train_labels = all_tweets['is_viral']


# Classify one hand-written sample using its 5 nearest neighbors
test_sample = [0.8, 0.028, 0.1]
guess = classify(test_sample, train_data, train_labels, 5)
print(guess)

       tweet_length  followers_count  friends_count
0          0.616378        -0.028782      -0.144827
1         -1.645702        -0.028861      -0.162091
2          0.616378        -0.028876      -0.115661
3          0.616378        -0.013393       0.108510
4          0.616378        -0.029229      -0.152935
5          0.580472        -0.029214      -0.123770
6          0.616378        -0.028563      -0.161960
7          0.580472        -0.029288      -0.172946
8          0.616378        -0.023889       0.199800
9          0.616378        -0.029596      -0.136456
10         0.616378        -0.029259      -0.180270
11        -2.399729        -0.026323      -0.122593
12         0.616378        -0.028325      -0.067923
13         0.616378        -0.029840      -0.182494
14         0.472754        -0.027744      -0.153589
15         0.616378        -0.029432      -0.178439
16        -1.932950        -0.029432      -0.102713
17        -1.322548        -0.029616      -0.168892
18         0

In [26]:
# Use K-Nearest Neighbor algorithm for regression
def predict(unknown, dataset, ratings, k):
    """Estimate a value for ``unknown`` from its k nearest neighbors.

    The estimate is the inverse-distance-weighted mean of the neighbors'
    ratings, so closer neighbors contribute more.

    Args:
        unknown: The point to predict for, in the form ``distance`` accepts.
        dataset: Indexable collection of points; ``dataset[i]`` is compared
            with ``unknown`` via ``distance``.
        ratings: Indexable collection where ``ratings[i]`` is the rating of
            ``dataset[i]``.
        k: How many of the closest points to use.

    Returns:
        The weighted mean rating. If one or more selected neighbors lie at
        distance 0 from ``unknown``, the plain mean of those neighbors'
        ratings is returned, because a 1/distance weight is undefined there.

    Raises:
        ValueError: If no neighbors are selected (for example an empty
            dataset or ``k == 0``).
    """
    ranked = sorted(
        [distance(dataset[i], unknown), i] for i in range(len(dataset))
    )
    neighbors = ranked[:k]
    if not neighbors:
        raise ValueError("predict needs at least one neighbor")

    # Exact matches would make 1/distance blow up; let them decide alone.
    exact_ratings = [ratings[i] for dist, i in neighbors if dist == 0]
    if exact_ratings:
        return sum(exact_ratings) / len(exact_ratings)

    numerator = 0
    denominator = 0
    for distance_to_neighbor, i in neighbors:
        numerator += ratings[i] / distance_to_neighbor
        denominator += 1 / distance_to_neighbor

    return numerator / denominator