In [3]:
import numpy as np
import pandas as pd
from stemmer import Stemmer
import re,string

def tweet_cleaner(tweet, stopwords = None, s = None):
    """Turn one raw tweet into a list of stemmed tokens with stop words removed.

    Processing order: lower-case the text, replace URLs with the placeholder
    ``URL`` and @-mentions with the placeholder ``AT_USER``, drop the ``#`` of
    hashtags, split into words, stem each word, then filter against
    ``stopwords``.

    Args:
        tweet: Raw tweet text (str).
        stopwords: Iterable of tokens to drop. When None, the first column of
            '../datasets/stopwords.csv' is loaded and the two placeholders
            ('AT_USER', 'URL') are appended to it.
        s: Object exposing ``stem(word)``. When None, a ``Stemmer()`` is
            created.

    Returns:
        list of str: the surviving tokens, in order of appearance.
    """
    placeholders = ('AT_USER', 'URL')
    if stopwords is None:
        stopwords = pd.read_csv( '../datasets/stopwords.csv', sep=',', index_col=None, header=None)
        stopwords =  np.concatenate((stopwords.iloc[:,0].values, ['AT_USER', 'URL']))
    if s is None:
        s = Stemmer()
    # Membership on an array/list is a linear scan per word; a set makes the
    # final filter O(1) per token without changing which tokens match.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    tweet = tweet.lower() # convert text to lower-case
    # Placeholders are padded with spaces so they never fuse with adjacent
    # text (e.g. "mail@host" -> "mail AT_USER").
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' URL ', tweet) # replace URLs
    tweet = re.sub(r'@[^\s]+', ' AT_USER ', tweet) # replace usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    # Split into words. '[^\W_]+' yields the same tokens as mapping every
    # string.punctuation character to a space and matching '\w+' ('_' is the
    # only punctuation character that is also a word character), but the two
    # placeholder alternatives keep 'AT_USER' in one piece. The text was
    # lower-cased above, so upper-case 'AT_USER'/'URL' can only be placeholders.
    tokens = re.findall(r'AT_USER|URL|[^\W_]+', tweet)
    # Placeholders are not real words: leave them unstemmed so they reach the
    # stop-word filter exactly as spelled in the default stop-word list.
    tokens = [word if word in placeholders else s.stem(word) for word in tokens]
    # NOTE(review): the filter runs on *stemmed* tokens, so stop-word entries
    # only match if they equal the stemmed form - confirm against the CSV.
    return [word for word in tokens if word not in stopwords]

def tweets_cleaner(tweets, stopwords = None, s = None):
    """Clean every tweet in ``tweets`` and return one token list per tweet.

    The stop-word collection and the stemmer are resolved once here and then
    handed to each ``tweet_cleaner`` call, so the CSV is read at most once.

    Args:
        tweets: Iterable of raw tweet strings.
        stopwords: Tokens to drop. When None, the first column of
            '../datasets/stopwords.csv' plus 'AT_USER' and 'URL' is used.
        s: Stemmer passed through to ``tweet_cleaner``. When None, a
            ``Stemmer()`` is created.

    Returns:
        list: the ``tweet_cleaner`` result for each tweet, in input order.
    """
    if stopwords is None:
        stopword_table = pd.read_csv( '../datasets/stopwords.csv', sep=',', index_col=None, header=None)
        stopwords = np.concatenate((stopword_table.iloc[:,0].values, ['AT_USER', 'URL']))
    if s is None:
        s = Stemmer()
    return [tweet_cleaner(text, stopwords, s) for text in tweets]

def build_vocab(cleaned_tweets):
    """Return the sorted array of distinct tokens across all cleaned tweets.

    Args:
        cleaned_tweets: Iterable of token sequences (one per tweet).

    Returns:
        numpy.ndarray: every distinct token exactly once, in sorted order.
    """
    distinct_tokens = {token for tokens in cleaned_tweets for token in tokens}
    # Sorting makes the column order of the vocabulary deterministic, since
    # set iteration order is not.
    return np.sort(list(distinct_tokens))

def build_features(tweet, vocab):
    """Encode one cleaned tweet as a binary presence vector over ``vocab``.

    Args:
        tweet: Sequence of tokens belonging to the tweet.
        vocab: Array-like of vocabulary tokens; its order fixes the order of
            the output positions.

    Returns:
        numpy.ndarray of int with the shape of ``vocab``: 1 where the
        vocabulary entry occurs in ``tweet``, 0 otherwise.
    """
    # np.isin already yields one boolean per vocabulary entry, so no
    # pre-allocated zero vector is needed.
    return np.isin(vocab, tweet).astype(int)

def build_representation(tweets, vocab):
    """Encode each cleaned tweet as a feature vector over ``vocab``.

    Args:
        tweets: Iterable of token sequences (one per tweet).
        vocab: Vocabulary passed unchanged to ``build_features``.

    Returns:
        list: the ``build_features`` result for each tweet, in input order.
    """
    return [build_features(tokens, vocab) for tokens in tweets]

In [4]:
# Smoke test on a single sample tweet. No stop words or stemmer are passed, so
# tweets_cleaner loads '../datasets/stopwords.csv' and creates a Stemmer().
cleaned_tweets = tweets_cleaner(["#winning; an #iphone6 can't sweepstakes http://t.co/r7mya2cexo"])
# Sorted array of the distinct tokens that survived cleaning.
vocab = build_vocab(cleaned_tweets)
print(vocab)

['iphone6' 'sweepstak' 'win']


In [5]:
# Encode the cleaned sample tweet(s) as 0/1 presence vectors over `vocab`
# (one array per tweet); the cell displays the resulting list.
build_representation(cleaned_tweets, vocab)

[array([1, 1, 1])]

In [6]:
# Scratch check of the punctuation-stripping step: every character in
# string.punctuation is mapped to a space, then the text is split on whitespace.
'win; an iphone6 sweepstakes URL'.translate(str.maketrans(string.punctuation,len(string.punctuation)*' ')).split()

['win', 'an', 'iphone6', 'sweepstakes', 'URL']