## Oppgave 1:

In [50]:
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize, TweetTokenizer
import re
import string
import random

In [2]:
download_list = ["twitter_samples", "punkt", "wordnet", "averaged_perceptron_tagger", "stopwords"]

for item in download_list:
    if not nltk.download(item, quiet=True):
        print(f"Download of {item} failed!")

In [3]:
stop_words = stopwords.words('english')

In [4]:
print(type(twitter_samples), twitter_samples.__doc__)
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# Sjekk ut NLTK TwitterTokenizer

<class 'nltk.corpus.util.LazyCorpusLoader'> 
    To see the API documentation for this lazily loaded corpus, first
    run corpus.ensure_loaded(), and then run help(this_corpus).

    LazyCorpusLoader is a proxy object which is used to stand in for a
    corpus object before the corpus is loaded.  This allows NLTK to
    create an object for each corpus, but defer the costs associated
    with loading those corpora until the first time that they're
    actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own ``__class__`` and ``__dict__`` attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified ``nltk.data.path`` to point to its location),
    they can then use the corpus object wit

In [5]:
print(tweet_tokens[0], positive_tweets[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'] #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [6]:
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [7]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

print(" ".join(remove_noise(tweet_tokens[0], stop_words=stop_words)))
print(" ".join(remove_noise(tweet_tokens[0])))

#followfriday top engage member community week :)
#followfriday for be top engage member in my community this week :)


In [8]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens]
negative_cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens]

In [9]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [10]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [11]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [12]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [13]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.996
Most Informative Features
                      :( = True           Negati : Positi =   2067.8 : 1.0
                      :) = True           Positi : Negati =   1642.1 : 1.0
                follower = True           Positi : Negati =     37.2 : 1.0
                     sad = True           Negati : Positi =     24.4 : 1.0
                followed = True           Negati : Positi =     23.9 : 1.0
                     bam = True           Positi : Negati =     22.1 : 1.0
                     x15 = True           Negati : Positi =     15.8 : 1.0
              appreciate = True           Positi : Negati =     15.5 : 1.0
                     ugh = True           Negati : Positi =     14.5 : 1.0
                 welcome = True           Positi : Negati =     14.0 : 1.0
None


In [14]:
custom_tweet = "I ordered just once from TerribleCo, they were not bad, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))
print(custom_tokens)

print(classifier.classify(dict([token, True] for token in custom_tokens)))

['i', 'order', 'just', 'once', 'from', 'terribleco', 'they', 'be', 'not', 'bad', 'never', 'use', 'the', 'app', 'again']
Negative


Ting å se på:
Forstå naive bayes, og hvorfor "not happy" blir positivt. Teller den rekkefølge? Er alle ordene uavhengige av hverandre?  
Andre datasett, med andre sentiment enn positiv og negativ  
Inkludere tilfeller der teksten ikke er verken positiv eller negativ

## Utforsking av datasett med kjønn og tweets
https://www.kaggle.com/crowdflower/twitter-user-gender-classification  
Idé: Finne språklige mønster som går igjen hos menn og kvinner, og så se om disse går igjen i det forrige datasettet

In [15]:
import pandas as pd
pd.set_option('display.max_rows', 200)
from itertools import groupby

In [94]:
gender_df = pd.read_csv("../data/gender-classifier-DFE-791531.csv", encoding = "latin1")
relevant_rows = (gender_df["gender:confidence"] >= 0.9) & (gender_df["gender"] != "brand") & (gender_df["gender"] != "unknown")
gender_df = gender_df[relevant_rows][["text", "gender"]]
print(f"Fant {len(gender_df)} tweets.")
display(gender_df.loc[101].text)

Fant 10023 tweets.


'Im weakkkkk_Ù÷â_Ù÷â_Ù÷â_Ù÷â_Ù÷â\nTbh thats the only way to shut down girls who flex'

In [105]:
def convert(s):
    try:
        return s.group(0).encode('latin1').decode('utf8')
    except:
        return s.group(0)

In [106]:
def remove_noise2(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub('http','', token)
        token = re.sub('\/\/t\.co.+','', token)
        token = re.sub(r'[\x80-\xFF]+', convert, token) # Attempt to remove \x89 and such
        # Remove words containing special characters, many likely caused by faulty encoding
        token = re.sub(".*[\÷ùûüäôî\.\'\`].*",'', token)
        # token = re.sub("\\\d+",'', token) # Attempt to remove \x89 and such
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [107]:
twitter_tokenizer = TweetTokenizer(preserve_case=True)
gender_tokens = gender_df.apply(lambda row: twitter_tokenizer.tokenize(row["text"]), axis=1)
gender_cleaned_tokens = [remove_noise2(tokens, stop_words) for tokens in gender_tokens]
all_words = get_all_words(gender_cleaned_tokens)
freq_dist_pos = FreqDist(all_words)
print(freq_dist_pos.most_common(100))

[('_ù', 1129), ('\x89', 1082), ('get', 952), ('go', 687), ('like', 663), ('\x8f', 564), ('make', 516), ('one', 496), ('day', 448), ('love', 429), ('time', 421), ('good', 392), ('people', 351), ('know', 348), ('see', 343), ('say', 339), ('want', 327), ('think', 324), ('new', 315), ('look', 301), ('__ù', 293), ('û_', 284), ('best', 280), ('â_ù', 274), ('need', 272), ('come', 268), ('back', 267), ('â', 263), ('take', 261), ('\x95', 256), ('thing', 243), ('work', 243), ('still', 240), ('last', 228), ('year', 218), ('\x9d', 211), ('life', 205), ('u', 203), ('great', 203), ('lol', 200), ('would', 200), ('really', 197), ('2', 192), ('way', 192), ('fuck', 190), ('\x8d', 187), ('try', 187), ('watch', 186), ('even', 184), ('today', 182), ('follow', 176), ('right', 175), ('tell', 175), ('give', 173), ('never', 166), ('world', 165), ('shit', 162), ('much', 161), ('game', 158), ('always', 157), ('via', 154), ('first', 153), ('ever', 153), ('find', 152), ('use', 151), ('im', 150), ('girl', 149), ('w

**TODO: Need to make a tokenizer that treat these two datasets equivalently. The gendered set includes "http" for example.**

For å finne mønster vil vi sjekke de enkeltordene, og gruppene på to og tre ord, hvor det er størst forskjell på bruken hos kvinner og menn, og hvor ordene totalt er brukt nok til at vi kan tro på at det er sannsynlig at dette er et reelt mønster.

Mer konkret, må vi altså implementere følgende:  
~~1) En tokenizer og wordnet-greie som lager tokens.~~  
~~2) En funksjon som henter ut grupper på 1, 2 og 3 ord, sortert etter frekvens.~~  
3) Hente ut de ordene eller ordgruppene som er brukt mer enn for eksempel 50 ganger totalt.  
4) Dele datasettet inn i menn og kvinner, og gjøre ei vurdering på hvor sikker vi her må være på kjønn (0.9 ser bra ut)  
5) Sjekke hvor ofte hver av ordgruppene forekommer hos kvinner og menn  
6) Sortere ordgruppene etter den betingede sannsynligheten for at noen er kvinne gitt at de har brukt denne ordgruppen  
  
7) Manuelt søke etter ordgrupper som brukes mye av menn eller kvinner, men som har et synonym hos det andre kjønnet  
8) Manuelt konstruere setninger som bruker disse ordene, og se om vår sentimentalgoritme gir disse rent kjønnede ordene forskjellig positivitets-verdi  

9) Sammenligne den relative frekvensen av ordgruppene hos menn, kvinner, og positive og negative sentimenter  

Fjerne ordgrupper som brukes mye, men av veldig få.

Her ser vi altså ikke på mer avanserte mønster, som om setningsoppbyggingen er forskjellig.

In [77]:
def word_groups(data, length):
    """Groups words in lists of length == length
    
    :param data: Nested list of strings. Inner dimension is for sentences.
    :param length: Int.
    
    :return: Dictionary with all unique groups, string : int number of occurences.
    """
    word_groups = [" ".join([sentence[i+ii] if i + ii < len(sentence) else "" for ii in range(length)]) for sentence in data for i in range(len(sentence))]
    word_groups.sort()
    word_groups = {key: len(list(group)) for key, group in groupby(word_groups)}
    word_groups = {k: v for k, v in sorted(word_groups.items(), key=lambda item: item[1], reverse=True)}
    return word_groups

In [108]:
word_groups(gender_cleaned_tokens, 4)

{'\x89 û_  ': 243,
 'û_   ': 243,
 'â   ': 196,
 '\x8f   ': 177,
 '__ù __ù __ù __ù': 149,
 'via   ': 124,
 '\x89 \x9d \x95 \x8f': 117,
 'lol   ': 103,
 '_ù   ': 92,
 '_ù â  ': 88,
 '\x95 \x8f  ': 81,
 'â_ù â  ': 78,
 '\x89 ¼ \x95 \x8f': 67,
 'time   ': 65,
 'day   ': 60,
 '#pushawardslizquens   ': 55,
 ':)   ': 55,
 'everydayiloveyou forevermore make last': 51,
 'forevermore make last #pushawardslizquens': 51,
 'last #pushawardslizquens  ': 51,
 'make last #pushawardslizquens ': 51,
 '\x95 \x8f \x89 \x9d': 49,
 '_ù \x8f  ': 48,
 '\x8d   ': 48,
 'â_ù â_ù â_ù â_ù': 46,
 'good   ': 44,
 '\x8f \x89 \x9d \x95': 42,
 '\x8f \x89 ¼ \x95': 42,
 '#artistoftheyear go vote favorite': 41,
 '\x95 \x8f \x89 ¼': 41,
 '\x9d \x95 \x8f ': 41,
 '¼ \x95 \x8f \x89': 41,
 '\x9d \x95 \x8f \x89': 40,
 'â_ù â_ù â ': 40,
 '__ù   ': 39,
 'go   ': 38,
 'voted #artistoftheyear go vote': 38,
 'one   ': 37,
 'right   ': 37,
 'love   ': 36,
 '\x81   ': 36,
 '¥   ': 36,
 'life   ': 34,
 '©   ': 34,
 'õ   ': 34,
 'build