### Updates
#### (3/26/19)
- Separated urls into links and linked pictures
- Changed racist word count to regular frequency
- Racist/neutral scores are aggregated
- Time tweet created feature removed
- Corrected a few data types in the user-feature dataframe
- Replaced regular variables with np.arrays when calculating metrics to shorten script
- Specify racism dictionary path in cell under "User-defined Variables"

### User-defined Variables

In [4]:
# Specify filenames and directories here
# Specify the directory containing user tweets .json + .gz files
TWEETS_DIRECTORY = "../users-new/"

# Specify the name of the CSV file containing personality scores for each user
TRAIN_LABEL_FILE = "../train_labels_rand.csv"

# Specify the name of the CSV file containing dictionary words related to racism
RACISM_DICT_FILE = "../expanded.csv"

# Specify the name of the CSV file containing dictionary words related to religiosity
RELIGIOUS_DICTIONARY = "../religious_corpus.csv"

### Program Begins Here

In [51]:
# Imports here
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from textstat.textstat import textstatistics, easy_word_set, legacy_round
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dictfeaturizer import DictFeaturizer
from nltk import TweetTokenizer
from textstat.textstat import textstat
import graphviz
import sys
import os
import gzip
import re
import json
import string
import random
import numpy as np
import pandas as pd
import nltk.data
import emoji
import csv

#### Function Definitions

In [6]:
# Function definitions for accessing/parsing twitter/training data
# Get tweets from one user
def getUserTweetsAndData(userTweetsFile):
    data = []
    tweets = []

    with gzip.open(userTweetsFile,'r') as f:        
        for tweetDeetsBinary in f:
            # Convert each line (binary) to string
            tweetDeetsStr = tweetDeetsBinary.decode('utf-8')

            # Generate json objects
            tweetDeetsJson = json.loads(tweetDeetsStr)
            data.append(tweetDeetsJson)            
            tweets.append(tweetDeetsJson["text"])
    
    return data, tweets

# Get a user's ID
def getUserID(TWEETS_DIRECTORY, userTweetsFile):
    pattern = "(" + TWEETS_DIRECTORY.replace("/", "\/") + ")(\d+)(\.json\.gz)"
    m = re.match(pattern, userTweetsFile)
    
    return str(m[2])

# Get a user's SDO and RWA scores
def getUserScores(TWEETS_DIRECTORY, userTweetsFile, df):
    userID = getUserID(TWEETS_DIRECTORY, userTweetsFile)
    sdo_score = float(df.loc[userID, "sdo"])
    rwa_score = float(df.loc[userID, "rwa"])
    
    return sdo_score, rwa_score

# Print a user's data
def printUserStats(TWEETS_DIRECTORY, userTweetsFile, df):
    # Get user personality scores
    personalityScores = getUserScores(TWEETS_DIRECTORY, userTweetsFile, df)
    sdo_score = personalityScores[0]
    rwa_score = personalityScores[1]
    
    # Print user data
    print("Current twitter user file:", userTweetsFile)
    print("Number of tweets:", int(df.loc[userID, "num_tweets"]))
    print("Number of followers:", int(df.loc[userID, "num_followers"]))
    print("Day with the most tweets:", int(df.loc[userID, "day_with_most_tweets"]))
    print("Flesch Kincaid Grade:", float(df.loc[userID, "flesch_kincaid_grade"]))
    print("Coleman Liau Index:", float(df.loc[userID, "coleman_liau_index"]))
    print("Automated Readability Index:", float(df.loc[userID, "automated_readability_index"]))
    print("Linsear Write Formula:", float(df.loc[userID, "linsear_write_formula"]))
    print("Gunning Fog:", float(df.loc[userID, "gunning_fog"]))
    print("Average number of retweets per tweet:", float(df.loc[userID, "avg_num_retweets"]))
    print("Average number of favorites per tweet:", float(df.loc[userID, "avg_num_favorites"]))
    print("Average number of hashtags per tweet:", float(df.loc[userID, "avg_num_hashtags"]))
    print("Average number of emojis per tweet:", float(df.loc[userID, "avg_num_emojis"]))
    print("Average number of links per tweet:", float(df.loc[userID, "avg_num_links"]))
    print("Average number of linked pictures per tweet:", float(df.loc[userID, "avg_num_linked_pics"]))
    print("Percentage of positive tweets:", float(df.loc[userID, "percent_pos_tweets"]))
    print("Percentage of neutral tweets:", float(df.loc[userID, "percent_neu_tweets"]))
    print("Percentage of negative tweets:", float(df.loc[userID, "percent_neg_tweets"]))
    print("Average number of racist words per tweet:", float(df.loc[userID, "avg_racist_score"]))
    print("Average number of neutral words per tweet:", float(df.loc[userID, "avg_neutral_score"]))

    print("SDO Score:", sdo_score)
    print("RWA Score:", rwa_score)
    
    print("---------------------------------------------------------------------")

In [7]:
# Function definitions for calculating metrics for each user
# day_with_most_tweets
def updateDayFreq(daysTweetedCount, data):
    srch = re.search("(\w{3})", data["created_at"])
    day = str(srch[1])
    if day == "Mon":
        daysTweetedCount[0] += 1
    elif day == "Tue":
        daysTweetedCount[1] += 1
    elif day == "Wed":
        daysTweetedCount[2] += 1
    elif day == "Thu":
        daysTweetedCount[3] += 1
    elif day == "Fri":
        daysTweetedCount[4] += 1
    elif day == "Sat":
        daysTweetedCount[5] += 1
    elif day == "Sun":
        daysTweetedCount[6] += 1
    
def getDayWMostTweets(daysTweetedCount):
    return np.argmax(daysTweetedCount)

# Readability scores
def cleanTweet(text):
    cleanedText = " ".join([re.sub("[^a-zA-Z#']", '', x) for x in tweet_tokenizer.tokenize(text) if
                                 x and
                                 not x.startswith('http') and
                                 not x.startswith('@') and
                                 not x.startswith('#') and
                                 x.lower() != 'rt' and
                                 not (x.startswith('&') and x.endswith(';'))]).strip().lower()
    return cleanedText.lower()

# flesch, coleman, automated, linsear, gunning readability scores
def updateReadabilityScores(readabilityScores, cleanedTweet):
    readabilityScores[0] += textstat.flesch_kincaid_grade(cleanedTweet)
    readabilityScores[1] += textstat.coleman_liau_index(cleanedTweet)
    readabilityScores[2] += textstat.automated_readability_index(cleanedTweet)
    readabilityScores[3] += textstat.linsear_write_formula(cleanedTweet)
    readabilityScores[4] += textstat.gunning_fog(cleanedTweet)
    
# avg_num_emojis
def extract_emojis(text):
    return [c for c in text if c in emoji.UNICODE_EMOJI]

# avg_num_retweets, avg_num_favorites, avg_num_hashtags, avg_num_emojis
def updateTweetFeaturesFreq(tweetFeaturesCount, data, tweet):
    tweetFeaturesCount[0] += data["retweet_count"]
    tweetFeaturesCount[1] += data["favorite_count"]
    tweetFeaturesCount[2] += len(data["entities"]["hashtags"])
    tweetFeaturesCount[3] += len(extract_emojis(tweet))
    
# avg_num_links/avg_num_linked_pics
def updateLinksAndPicsFreq(linksAndPicsCount, urls):
    for url in urls:
        m = re.search('https:\/\/twitter\.com\/i\/web\/status\/', url["expanded_url"])
        if m:
            linksAndPicsCount[1] += 1
        else:
            linksAndPicsCount[0] += 1
            
# percent_pos_tweets, percent_neu_tweets, percent_neg_tweets
def updateSentimentFreq(sentimentCount, text):
    sid = SentimentIntensityAnalyzer()
    filteredTweetText = ''.join(filter(lambda x: x in string.printable, text))
    ss = sid.polarity_scores(filteredTweetText)
    compoundScore = ss["compound"]
    if compoundScore > 0:
        sentimentCount[0] += 1
    elif compoundScore == 0:
        sentimentCount[1] += 1
    else:
        sentimentCount[2] += 1
        
# avg_num_racist_words/avg_num_neutral_words
def aggregateRacismResults(racismDictResults):
    numRacistWords = racismDictResults['racist-stereotypes'] + \
                        racismDictResults['racist-skin_color'] + \
                        racismDictResults['racist-culture'] + \
                        racismDictResults['racist-country'] + \
                        racismDictResults['racist-animals'] + \
                        racismDictResults['racist-migration'] + \
                        racismDictResults['racist-nationality'] + \
                        racismDictResults['racist-religion'] + \
                        racismDictResults['racist-crime'] + \
                        racismDictResults['racist-race'] + \
                        racismDictResults['racist-diseases']
    numNeutralWords = racismDictResults['neutral-migration'] + \
                        racismDictResults['neutral-skin_color'] + \
                        racismDictResults['neutral-country'] + \
                        racismDictResults['neutral-religion'] + \
                        racismDictResults['neutral-nationality'] 
    
    return numRacistWords, numNeutralWords


### AUSTIN CODE FUNCTION RELIGIOUSNESS
def read_csv(directory):
    religious_words = []
    with open(directory) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in csvfile:
            if row != 'WORSHIP,\n':
                religious_words.append(row[:-2].lower())
            else:
                religious_words.append(row[:-1].lower())
            
    return religious_words

def is_tweet_religious(tweet,dict):
    needed_words = 2
    common_words = set(dict) & set(tweet.split())
    if len(common_words) >= 2:
        return True
    
def religion_score_test():
    tweets = ["i am not religious", "nope not me either", "holy church", "christians can perform miracles at church"]
    religionDict = read_csv(RELIGIOUS_DICTIONARY)
    
    print(religionDict)
    
    totalNumTweets = len(tweets)
    totalReligiousTweets = 0
    religiousScore = 0.0
    
    for i in range(totalNumTweets):
        #calculate religious score for tweet
        print("Processing Tweet: " + tweets[i])
        if is_tweet_religious(tweets[i],religionDict):
            totalReligiousTweets += 1
    
    #calculations for user
    if totalReligiousTweets and totalNumTweets:
        religiousScore = totalReligiousTweets/totalNumTweets
        
    print (totalReligiousTweets)
    print (religiousScore)


In [8]:
# Function definitions for LDA
# Print the words in their respective topics
def printTopWords(model, featureNames, nTopWords):
    for topicIdx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topicIdx
        message += " ".join([featureNames[i] for i in topic.argsort()[:-nTopWords-1:-1]])
        print(message)
    
    print()

#### Pre-processing

In [9]:
# Get the gzip files
gzipFiles = [TWEETS_DIRECTORY + objname for objname in os.listdir(TWEETS_DIRECTORY) if re.search(r".+\.gz$", objname)]
for gzfilename in gzipFiles:
    print(gzfilename)

../users-new/38381682.json.gz
../users-new/36863478.json.gz
../users-new/36670025.json.gz


In [10]:
# Load training labels and dictionaries + init tokenizer
df = pd.read_csv(TRAIN_LABEL_FILE, dtype={'user_id': 'str'})
racismDict = DictFeaturizer.load(RACISM_DICT_FILE)
racismDict.rel = False
tweet_tokenizer = TweetTokenizer()

In [11]:
# Specify feature columns in dataframe
featureNames = [
    "num_tweets",#ok
    "num_followers",#ok
    "day_with_most_tweets",#ok    
    "flesch_kincaid_grade",#ok
    "coleman_liau_index",#ok
    "automated_readability_index",#ok
    "linsear_write_formula",#ok
    "gunning_fog",#ok
    "avg_num_retweets",#ok
    "avg_num_favorites",#ok
    "avg_num_hashtags",#ok
    "avg_num_emojis",#ok
    "avg_num_links",#ok
    "avg_num_linked_pics",#ok
    "percent_pos_tweets",#ok
    "percent_neu_tweets",#ok
    "percent_neg_tweets",#ok
    "avg_racist_score",#ok
    "avg_neutral_score",#ok
    "avg_sexist_score",#Habeeb
    "avg_relig_score"#Austin
]

In [12]:
# Init/reset all fields to 0.0
for featName in featureNames:
    df[featName] = np.zeros(df.shape[0])

In [13]:
# Correct the data types in the dataframe + set df index
df = df.astype({"num_tweets": int,
                "num_followers": int,
                "num_followers": int,
                "day_with_most_tweets": int
               })
df = df.set_index("user_id")
print(df.head(3))

                sdo       rwa  num_tweets  num_followers  \
user_id                                                    
12432922   0.585655  0.175252           0              0   
172018245  0.808501  0.567951           0              0   
929862361  0.083294  0.587029           0              0   

           day_with_most_tweets  flesch_kincaid_grade  coleman_liau_index  \
user_id                                                                     
12432922                      0                   0.0                 0.0   
172018245                     0                   0.0                 0.0   
929862361                     0                   0.0                 0.0   

           automated_readability_index  linsear_write_formula  gunning_fog  \
user_id                                                                      
12432922                           0.0                    0.0          0.0   
172018245                          0.0                    0.0          0.0   
9

In [14]:
# Calculate metrics for each user
for userTweetsFile in gzipFiles:
    # Get data and tweets
    userID = getUserID(TWEETS_DIRECTORY, userTweetsFile)
    dataAndTweets = getUserTweetsAndData(userTweetsFile)
    data = dataAndTweets[0]
    tweets = dataAndTweets[1]
    totalNumTweets = len(data)
    daysTweetedCount = np.zeros(7)   # [mon, tue, ..., sun]
    sentimentCount = np.zeros(3)     # [#pos, #neu, #neg]
    linksAndPicsCount = np.zeros(2)  # [#links, #pics]
    readabilityScores = np.zeros(5)  # [flesch, coleman, automated, linsear, gunning]
    tweetFeaturesCount = np.zeros(4) # [#retweets, #favorites, #hashtags, #emojis]
    totalReligiousTweets = 0 #Amount of religious tweets for a user
    religiousScore = 0.0 #user religious socre
    
    religionDict = read_csv(RELIGIOUS_DICTIONARY) #reads in religion dict may need to change pos for clarity
    
    # Look through each tweet
    for i in range(totalNumTweets):
        
        # Get the day of the week tweeted
        updateDayFreq(daysTweetedCount, data[i])

        # Calculate average readability scores
        cleanedTweet = cleanTweet(tweets[i])
        updateReadabilityScores(readabilityScores, cleanedTweet)
        
        # Get the number retweeted/favorited/hashtags/emojis
        updateTweetFeaturesFreq(tweetFeaturesCount, data[i], tweets[i])
        
        # Get the number of links
        updateLinksAndPicsFreq(linksAndPicsCount, data[i]["entities"]["urls"])
        
        # Check if tweet is positive, negative, or neutral
        updateSentimentFreq(sentimentCount, cleanedTweet)
        
        # Count the number of racist/neutral words used
        racismDictResults = racismDict.transform(cleanedTweet.split())
        aggregatedRacismResults = aggregateRacismResults(racismDictResults)
        df.loc[userID, "avg_racist_score"] += aggregatedRacismResults[0]
        df.loc[userID, "avg_neutral_score"] += aggregatedRacismResults[1]
        
        if is_tweet_religious(cleanedTweet,religionDict):
            totalReligiousTweets += 1
        
    # User-level metrics
    # Get the number of tweets total/number of followers
    df.loc[userID, "num_tweets"] = totalNumTweets
    df.loc[userID, "num_followers"] = data[i]["user"]["followers_count"]
    
    # Get the day that on which the user tweeted most
    df.loc[userID, "day_with_most_tweets"] = getDayWMostTweets(daysTweetedCount)
    
    # Get the average readability scores for a user
    readabilityScores /= totalNumTweets
    df.loc[userID, "flesch_kincaid_grade"] = readabilityScores[0]
    df.loc[userID, "coleman_liau_index"] = readabilityScores[1]
    df.loc[userID, "automated_readability_index"] = readabilityScores[2]
    df.loc[userID, "linsear_write_formula"] = readabilityScores[3]
    df.loc[userID, "gunning_fog"] = readabilityScores[4]
    
    # Get the average number of retweets/favorites/hashtags/emojis per tweet
    tweetFeaturesCount /= totalNumTweets
    df.loc[userID, "avg_num_retweets"] = tweetFeaturesCount[0]
    df.loc[userID, "avg_num_favorites"] = tweetFeaturesCount[1]
    df.loc[userID, "avg_num_hashtags"] = tweetFeaturesCount[2]
    df.loc[userID, "avg_num_emojis"] = tweetFeaturesCount[3]
    
    # Get the average number of links/linked pictures per tweet
    linksAndPicsCount /= totalNumTweets
    df.loc[userID, "avg_num_links"] = linksAndPicsCount[0]
    df.loc[userID, "avg_num_linked_pics"] = linksAndPicsCount[1]
    
    # Get the percentage of pos/neu/neg tweets
    sentimentCount /= totalNumTweets
    df.loc[userID, "percent_pos_tweets"] = sentimentCount[0]
    df.loc[userID, "percent_neu_tweets"] = sentimentCount[1]
    df.loc[userID, "percent_neg_tweets"] = sentimentCount[2]
    
    # Get the average number of racist/neutral words per tweet
    df.loc[userID, "avg_racist_score"] /= totalNumTweets
    df.loc[userID, "avg_neutral_score"] /= totalNumTweets
    
    #Get religious score of user
    df.loc[userID, "avg_religous_score"] = totalReligiousTweets / totalNumTweets
    
    # Print all a user's data
    printUserStats(TWEETS_DIRECTORY, userTweetsFile, df)

Current twitter user file: ../users-new/38381682.json.gz
Number of tweets: 420
Number of followers: 2979
Day with the most tweets: 5
Flesch Kincaid Grade: 6.575952380952383
Coleman Liau Index: 6.223285714285711
Automated Readability Index: 6.4921428571428565
Linsear Write Formula: 6.916666666666667
Gunning Fog: 7.9156428571428705
Average number of retweets per tweet: 18.873809523809523
Average number of favorites per tweet: 0.6428571428571429
Average number of hashtags per tweet: 0.6857142857142857
Average number of emojis per tweet: 0.01904761904761905
Average number of links per tweet: 0.29523809523809524
Average number of linked pictures per tweet: 0.05
Percentage of positive tweets: 0.4023809523809524
Percentage of neutral tweets: 0.3976190476190476
Percentage of negative tweets: 0.2
Average number of racist words per tweet: 0.319047619047619
Average number of neutral words per tweet: 0.1976190476190476
SDO Score: 0.40267886799999997
RWA Score: 0.977335215
-------------------------

In [15]:
# Get complete rows of data from DF and convert to np_array
colNames = ["user_id", "sdo", "rwa"] + featureNames
newDF = pd.DataFrame(columns=colNames)
newDF = newDF.set_index("user_id")

for userTweetsFile in gzipFiles:
    userID = getUserID(TWEETS_DIRECTORY, userTweetsFile)
    newDF = newDF.append(df.loc[userID])
    
npDF = newDF.values
print(npDF)

[[4.02678868e-01 9.77335215e-01 4.20000000e+02 2.97900000e+03
  5.00000000e+00 6.57595238e+00 6.22328571e+00 6.49214286e+00
  6.91666667e+00 7.91564286e+00 1.88738095e+01 6.42857143e-01
  6.85714286e-01 1.90476190e-02 2.95238095e-01 5.00000000e-02
  4.02380952e-01 3.97619048e-01 2.00000000e-01 3.19047619e-01
  1.97619048e-01 0.00000000e+00 0.00000000e+00 9.52380952e-03]
 [3.80268166e-01 2.51323511e-01 3.23400000e+03 1.03610000e+04
  5.00000000e+00 8.53209647e+00 7.41035250e+00 7.79140383e+00
  8.97031540e+00 9.47187384e+00 2.45289734e+02 1.36054422e-01
  1.52752010e-01 8.65800866e-03 1.20593692e-01 6.89857761e-01
  2.64687693e-01 2.50463822e-01 4.84848485e-01 3.20964750e-01
  1.28014842e-01 0.00000000e+00 0.00000000e+00 5.84415584e-02]
 [2.67173800e-02 3.22228218e-01 3.21700000e+03 2.36550200e+06
  3.00000000e+00 7.01000933e+00 7.90016164e+00 8.24140504e+00
  7.07133976e+00 8.46105689e+00 1.72339447e+02 2.02535903e+02
  1.80603046e-01 3.87006528e-01 3.78924464e-01 3.37270749e-01
  3.11

In [16]:
# Split the data into training/testing sets
trainSize = int(.75*npDF.shape[0])
Xtrain = npDF[:trainSize, 2:]
ytrain = npDF[:trainSize, :2]
Xtest = npDF[trainSize:, 2:]
ytest = npDF[trainSize:, :2]

print(Xtrain.shape)
print(ytrain.shape)
print(Xtest.shape)
print(ytest.shape)

(2, 22)
(2, 2)
(1, 22)
(1, 2)


#### Regression

In [17]:
# Run Linear Regression
regr = linear_model.LinearRegression()
regr.fit(Xtrain, ytrain)
yPred = regr.predict(Xtest)
print(yPred)

[[  -5.85731589 -201.81994718]]


In [18]:
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % mean_squared_error(ytest, yPred))
print('Variance score: %.2f' % r2_score(ytest, yPred))

Coefficients: 
 [[-1.00960423e-06 -2.64850690e-06  0.00000000e+00 -7.01823507e-10
  -4.25894688e-10 -4.66147611e-10 -7.36806128e-10 -5.58343064e-10
  -8.12332888e-08  1.81830195e-10  1.91215696e-10  3.72757448e-12
   6.26587519e-11 -2.29567556e-10  4.94014558e-11  5.27962111e-11
  -1.02197667e-10 -6.87826243e-13  2.49725302e-11  0.00000000e+00
   0.00000000e+00 -1.75506632e-11]
 [-3.27068954e-05 -8.58003916e-05  0.00000000e+00 -2.27361053e-08
  -1.37971817e-08 -1.51012057e-08 -2.38693938e-08 -1.80879474e-08
  -2.63161406e-06  5.89052722e-09  6.19457761e-09  1.20757605e-10
   2.02987784e-09 -7.43701523e-09  1.60039766e-09  1.71037334e-09
  -3.31077100e-09 -2.22826533e-11  8.09004073e-10  0.00000000e+00
   0.00000000e+00 -5.68567057e-10]]
Mean squared error: 20448.04
Variance score: 0.00


### Don't run past here, still broken from multiple changes

In [19]:
# Feature extraction (old)

vectorizer = CountVectorizer(stop_words='english')
tf = vectorizer.fit_transform(tweets)
print("tf shape:", tf.shape)

y = np.empty(tf.shape[0])

# Give each tweet the personality score of the user
if TRAIN_LABEL_FILE == "":
    # Randomly fill score if none provided
    #y.fill(np.random.uniform(size=1)[0])
    y = np.random.uniform(size=tf.shape[0])
    print(y[0])
else:
    #y.fill(float(sdo_score))
    y = np.random.uniform(size=tf.shape[0])
    print(y[0])


tf shape: (3217, 11337)
0.08385362297850163


In [None]:
# Feature selection (old)
"""
#tfNew = SelectKBest(f_regression, k=10).fit_transform(tf, y)
selPercent = SelectPercentile(f_regression, percentile=10)
tfNew = selPercent.fit_transform(tf, y)
print("tfNew shape:", tfNew.shape)
"""

#### LDA

In [20]:
# Run LDA with original tf

lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda.fit(tf)
print("Topics in LDA model:")
tfFeatureNames = vectorizer.get_feature_names()
printTopWords(lda, tfFeatureNames, 20)


Topics in LDA model:
Topic #0: tomi_adeyemi household frfe3m9az3 sounds wait earth dreams samuel adeyemimichael turns regional rock idea hope eto happiness perfect eliudkipchoge dangers congratulations
Topic #1: injury newcastle undergoes squad sangare zmq99ob4th moise_katumbi nomination woxpknkgiw mygjsbwvex boat museveni spain tragedy toure turn ghanaians draw vincent coding
Topic #2: rwanda 16 genocide rwandan shock kenyans force thomas rwigara diane open athlete plane malawi known later weekend albinism bans businesses
Topic #3: https rt africa nigeria south president year women says world people congo new kenya bbc dr election 2018 uganda cameroon
Topic #4: thanks comments saxophone favourite virtuoso osei wr59aam8fw teddy sharing watching prevent franklin aretha georgiendirangu comedian game yaolri ministry poaching investigation
Topic #5: https joins rt sierra leone forward minister nigerian big mosalah music yes loan burundi bbcworldservice boko haram nyom 6p3fyfxxor schalke
To

In [21]:
# Run LDA with selected tf
'''
lda2 = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda2.fit(tfNew)
print("Topics in new LDA model:")
tfFeatureNames = vectorizer.get_feature_names()
retained = selPercent.get_support(True)
newFeatureNames = []

# Get the list of selected feature names 
for idx in retained:
    newFeatureNames.append(tfFeatureNames[idx])

printTopWords(lda2, newFeatureNames, 20)
'''

NameError: name 'tfNew' is not defined

# References

https://www.geeksforgeeks.org/readability-index-pythonnlp/
- Referenced code for calculating readability score
- There are 4 readability formulas we can use
    - flesch_reading_ease(text)
    - gunning_fog(text)
    - smog_index(text)
    - dale_chall_readability_score(text)

@article{tulkens2016automated,
  title={The automated detection of racist discourse in dutch social media},
  author={Tulkens, St{\'e}phan and Hilte, Lisa and Lodewyckx, Elise and Verhoeven, Ben and Daelemans, Walter},
  journal={Computational Linguistics in the Netherlands Journal},
  volume={6},
  number={1},
  pages={3--20},
  year={2016}
}
- Racism dictionary

https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text
- Code used to extract emojis

Machine Learning
SDO 0 
RWA 1

In [62]:
print (npDF [0:,2:])
print(npDF [0:,:2])
print (y)

[[4.20000000e+02 2.97900000e+03 5.00000000e+00 6.57595238e+00
  6.22328571e+00 6.49214286e+00 6.91666667e+00 7.91564286e+00
  1.88738095e+01 6.42857143e-01 6.85714286e-01 1.90476190e-02
  2.95238095e-01 5.00000000e-02 4.02380952e-01 3.97619048e-01
  2.00000000e-01 3.19047619e-01 1.97619048e-01 0.00000000e+00
  0.00000000e+00 9.52380952e-03]
 [3.23400000e+03 1.03610000e+04 5.00000000e+00 8.53209647e+00
  7.41035250e+00 7.79140383e+00 8.97031540e+00 9.47187384e+00
  2.45289734e+02 1.36054422e-01 1.52752010e-01 8.65800866e-03
  1.20593692e-01 6.89857761e-01 2.64687693e-01 2.50463822e-01
  4.84848485e-01 3.20964750e-01 1.28014842e-01 0.00000000e+00
  0.00000000e+00 5.84415584e-02]
 [3.21700000e+03 2.36550200e+06 3.00000000e+00 7.01000933e+00
  7.90016164e+00 8.24140504e+00 7.07133976e+00 8.46105689e+00
  1.72339447e+02 2.02535903e+02 1.80603046e-01 3.87006528e-01
  3.78924464e-01 3.37270749e-01 3.11470314e-01 3.96021138e-01
  2.92508548e-01 4.00994716e-01 3.13957103e-01 0.00000000e+00
  0.

In [63]:
clf = tree.DecisionTreeRegressor()
#clf = clf.fit(Xtrain,ytrain)
scores = cross_val_score(clf, Xtrain, ytrain, cv=2)
print(scores)


[0. 0.]


In [64]:
clf = RandomForestRegressor()
#clf = clf.fit(Xtrain,ytrain)
scores = cross_val_score(clf, Xtrain, ytrain, cv=2)
print(scores)

[0. 0.]




In [65]:
clf = linear_model.Ridge(alpha = 1.0)
#clf = clf.fit(Xtrain,ytrain)
scores = cross_val_score(clf, Xtrain, ytrain, cv=2)
print(scores)

[0. 0.]


In [58]:
clf = linear_model.Lars(n_nonzero_coefs=1)
clf = clf.fit(Xtrain,ytrain)
#scores = cross_val_score(clf, Xtrain, y, cv=2)
#print(scores)
print(clf.coef_) 

[[-7.96400213e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.57999895e-04  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [68]:
clf_SDO = linear_model.BayesianRidge(compute_score=True)
clf_SDO = clf.fit(Xtrain,ytrain[0,])
#scores = cross_val_score(clf, Xtrain, y, cv=2)
#print(scores)
print(clf.coef_) 

clf_RWA = linear_model.BayesianRidge(compute_score=True)
clf_RWA = clf.fit(Xtrain,ytrain[1,])
#scores = cross_val_score(clf, Xtrain, y, cv=2)
#print(scores)
print(clf.coef_) 

[ 2.58883216e-05  6.79131449e-05  0.00000000e+00  1.79961931e-08
  1.09208126e-08  1.19529801e-08  1.88932192e-08  1.43170605e-08
  2.08298801e-06 -4.66249887e-09 -4.90316232e-09 -9.55826493e-11
 -1.60669882e-09  5.88658262e-09 -1.26675457e-09 -1.35380306e-09
  2.62055763e-09  1.76372746e-11 -6.40346855e-10  0.00000000e+00
  0.00000000e+00  4.50034973e-10]
[-5.80896862e-06 -1.52387372e-05  0.00000000e+00 -4.03808801e-09
 -2.45047396e-09 -2.68207756e-09 -4.23936782e-09 -3.21254334e-09
 -4.67392680e-07  1.04619798e-09  1.10019941e-09  2.14473777e-11
  3.60520206e-10 -1.32086484e-09  2.84241586e-10  3.03774019e-10
 -5.88015605e-10 -3.95755183e-12  1.43684664e-10  0.00000000e+00
  0.00000000e+00 -1.00981403e-10]


In [70]:
clf_SDO = linear_model.LogisticRegression()
clf_SDO = clf.fit(Xtrain,ytrain[0])
scores = cross_val_score(clf, Xtrain, ytrain, cv=2)
print(scores)
print(clf.coef_) 

clf_RWA = linear_model.LogisticRegression()
clf_RWA = clf.fit(Xtrain,ytrain[1])
scores = cross_val_score(clf, Xtrain, ytrain, cv=2)
print(scores)
print(clf.coef_) 



ValueError: Unknown label type: 'continuous'