In [None]:
'''
Sentiment analysis of the Welsh language using Word2Vec as implemented in Gensim
NOTE:  This script relies heavily on "MODERN METHODS FOR SENTIMENT ANALYSIS" by Michael Czerny (with modifications)
Czerny's tutorial is available at https://www.districtdatalabs.com/modern-methods-for-sentiment-analysis

I found the pre-trained Welsh word2vec encoding at https://drive.google.com/drive/folders/1iGqzFlZifSeHzPhnz3qM68a37Ul7S7Xq
(Unfortunately, I cannot locate the page where I found this link, but it was via Cardiff University: https://www.cardiff.ac.uk/)

The accuracy percentage reported is from a held-back test set. I let it run for a few hours on the provided testing set, 
but it did not finish. Even on the test/train split dataset, the accuracy is poor (68%). The approach needs
more attention than the current deadline allows. 
'''

import csv
import math
import random

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.word2vec import Word2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

def buildWordVector(text, size):
    """Average the word2vec embeddings of the tokens in ``text``.

    Embeddings are looked up in the module-level ``welsh_w2v`` model; tokens
    the model has no vector for are skipped.

    Args:
        text: Iterable of tokens (words).
        size: Dimensionality of the embeddings.

    Returns:
        A ``(1, size)`` array holding the mean of the vectors that were found,
        or all zeros when no token had a vector.
    """
    total = np.zeros((1, size))
    found = 0.
    for token in text:
        try:
            embedding = welsh_w2v.wv[token]
        except KeyError:
            # Out-of-vocabulary token: it does not contribute to the mean.
            continue
        total += embedding.reshape((1, size))
        found += 1.
    # Avoid dividing by zero when nothing was found; return the zero vector.
    return total / found if found != 0 else total

#Do some very minor text preprocessing
def cleanText(corpus):
    """Lowercase every document, drop newline characters and split on whitespace.

    Args:
        corpus: Iterable of strings, one per document.

    Returns:
        A list with one list of tokens per document, in input order.
    """
    tokenized = []
    for document in corpus:
        normalized = document.lower().replace('\n', '')
        tokenized.append(normalized.split())
    return tokenized

# Load the pre-trained Welsh embeddings from 'welsh.vec' (word2vec format).
# NOTE(review): in the code visible here `word_vectors` is only used by the
# interactive `most_similar` experiment quoted below; the classifier features
# are built from the separate `welsh_w2v` model -- confirm this is intended.
word_vectors = KeyedVectors.load_word2vec_format('welsh.vec')

"""
Experimenting ...  checking relationships between words
>>> word_vectors.most_similar('car')
[('cerbyd', 0.8214571475982666), ('beic', 0.7958533763885498), ('gar', 0.7678565979003906), ('lori', 0.7664268612861633), 
('tryc', 0.7579614520072937), ('char', 0.7528360486030579), ('gerbyd', 0.7441043853759766), ('tractor', 0.7217353582382202), 
('trên', 0.7176547050476074), ('moped', 0.7080420851707458)]
    According to Google translate:
    cerbyd -> vehicle
    beic -> bike
    gar -> a car
    lori -> a truck
    tryc -> a truck
    char -> a car
    gerbyd -> vehicle
    tractor -> tractor
    trên -> train
    moped -> moped

I am not sure whether the fact that some of the Google translations include the indefinite article while others do not reflects
an aspect of the Welsh language or a Google quirk.

The documentation indicates that this method preserves/discovers nontrivial grammatical relationships.
i.e., "ate" - "eat" + "speak" = "spoke".  Unfortunately, I don't know enough about Welsh (yet!) to investigate this here.
"""

# Specifying utf8 appears to be required, as the import generated an encoding error message otherwise.
with open('train-v2.tsv', 'r', encoding="utf8") as infile:
    tweets = infile.readlines()

# Split the raw lines into positive and negative tweets. A line is accepted
# when it starts with "1\t" (stored as positive) or "0\t" (stored as negative);
# anything else is reported and skipped.
#
# Only the two-character label prefix is cut off, and the label is removed
# BEFORE stripping whitespace:
#   * str.replace() would also delete any later "1\t" / "0\t" inside the text;
#   * stripping first turns a line with empty text ("1\t\n") into "1", which
#     would then be stored as the tweet itself.
pos_tweets = []
neg_tweets = []
for tweet in tweets:
    if tweet.startswith("1\t"):
        pos_tweets.append(tweet[2:].strip())
    elif tweet.startswith('0\t'):
        neg_tweets.append(tweet[2:].strip())
    else:
        print('Input error: ', tweet)

# Labels: 1.0 for each positive tweet followed by 0.0 for each negative tweet,
# matching the order in which the tweets are concatenated below.
y = np.repeat([1.0, 0.0], [len(pos_tweets), len(neg_tweets)])

# Hold back 20% of the tweets for testing; the remaining 80% are used for training.
all_tweets = np.concatenate((pos_tweets, neg_tweets))
x_train, x_test, y_train, y_test = train_test_split(all_tweets, y, test_size=0.2)

# Tokenise both splits: lowercase, drop newlines (\n), split on whitespace.
x_train, x_test = cleanText(x_train), cleanText(x_test)

# Dimensionality of the word vectors.
n_dim = 300

# Train a fresh word2vec model on the training tweets only.
# min_count=10 ignores words that occur fewer than 10 times in the training set.
welsh_w2v = Word2Vec(vector_size=n_dim, min_count=10)
welsh_w2v.build_vocab(x_train)
welsh_w2v.train(x_train, total_examples=welsh_w2v.corpus_count, epochs=20)

# The embeddings are deliberately NOT trained further on the test tweets.
# Doing so after the training features have been extracted means the classifier
# is fitted on vectors from one embedding state and scored on vectors from
# another, and it lets held-back data influence the model being evaluated.
# (The vocabulary is fixed by build_vocab(), so no test-only words are lost.)

# One averaged embedding per tweet, stacked into (n_tweets, n_dim) matrices.
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])

# Standardise both splits with the mean/std of the TRAINING features, so the
# test matrix is mapped into the same feature space the classifier was fit on.
scaler = StandardScaler().fit(train_vecs)
train_vecs = scaler.transform(train_vecs)
test_vecs = scaler.transform(test_vecs)

# Logistic regression fitted by SGD with an L1 penalty.
# 'log_loss' is the current name of this loss; the old alias 'log' was removed
# in scikit-learn 1.3 (versions before 1.1 only accept 'log').
lr = SGDClassifier(loss='log_loss', penalty='l1')
lr.fit(train_vecs, y_train)

print('Test Accuracy: %.2f'%lr.score(test_vecs, y_test))
# The original protocol (embeddings re-trained on the test tweets, test set
# scaled on its own) reported about 0.68 here; re-measure after this change.