In [None]:

#check the data size and distribution of classes:
def get_data_stats(X, y):
    """Print how many samples X holds and how often each distinct label occurs in y.

    Parameters:
        X: Any sized collection of samples (only ``len(X)`` is used)
        y: Array-like of labels, handed to ``np.unique``

    Returns:
        None. The two summary lines are written to standard output.
    """
    n_samples = len(X)
    print(f"Size of dataset: {n_samples}")

    # np.unique returns the sorted distinct labels together with their counts
    labels, occurrences = np.unique(y, return_counts=True)
    class_distribution = {label: count for label, count in zip(labels, occurrences)}
    print(f"Distribution of classes: {class_distribution}")

# Print the sample count and the per-class counts of the full dataset
get_data_stats(X=X, y=y)

In [None]:
# Train / dev / test split (80% / 10% / 10%).
# A fixed seed keeps the three partitions identical across kernel restarts,
# so scores measured on dev/test stay comparable between runs.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
# Halve the 20% hold-out into equally sized dev and test sets
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)
print(f"Train size: {len(X_train)}\nDev size: {len(X_dev)}\nTest size: {len(X_test)}")

In [None]:
##BLU07 LN2

In [None]:
# Module-level tokenizer and stemmer; preprocess() looks both names up as globals
tokenizer, stemmer = (
    WordPunctTokenizer(),
    SnowballStemmer("english", ignore_stopwords=True),
)

In [None]:
def preprocess(doc):
    """Clean one raw document and return it as a space-joined string of stems.

    Steps, in order: strip HTML-like tags, lowercase, tokenize with the
    module-level ``tokenizer``, drop punctuation tokens, stem every remaining
    token with the module-level ``stemmer``.

    Parameters:
        doc (str): Raw text of a single document

    Returns:
        str: The stemmed tokens joined by single spaces ("" when nothing is left)
    """
    # remove html tags
    doc = re.sub("<[^>]*>", "", doc)
    # lowercase
    doc = doc.lower()
    # tokenize
    words = tokenizer.tokenize(doc)
    # remove punctuation: a token is dropped when *all* of its characters are
    # punctuation, so runs such as "..." or "?!" are removed too (a plain
    # `word in string.punctuation` substring test lets those through)
    punctuation_chars = set(string.punctuation)
    words = [
        word for word in words
        if not all(char in punctuation_chars for char in word)
    ]
    # stem
    stems = [stemmer.stem(word) for word in words]
    return " ".join(stems)

In [None]:
# Replace `docs` with the result of running preprocess() on each of its entries.
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# already-cleaned text through preprocess() a second time.
# presumably `docs` is a pandas Series of raw strings (it must provide .apply) — TODO confirm
docs = docs.apply(preprocess)

In [None]:
#BLU07 LN3

In [None]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    """Stateless transformer that cleans every sentence of an iterable of strings.

    Each sentence goes through, in order: regex substitutions, optional
    lowercasing, tokenization, optional punctuation removal, optional stemming,
    and is finally re-joined with single spaces.

    Parameters:
        tokenizer: Object exposing ``tokenize(str)`` that returns the tokens of a string
        stemmer: Object exposing ``stem(str)``; pass a falsy value (e.g. None) to skip stemming
        regex_list (list): Sequences whose first two items are a pattern and its
            replacement, applied in order with ``re.sub``
        lower (bool): Lowercase the sentence before tokenizing
        remove_punct (bool): Drop tokens made up only of ``string.punctuation`` characters
    """

    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct

    def transform(self, X, *_):
        """Return a list with the cleaned version of every sentence in X.

        Extra positional arguments are accepted and ignored.
        """
        return [self._clean_sentence(sentence) for sentence in X]

    @staticmethod
    def _is_punctuation(token):
        """Return True when every character of token is in ``string.punctuation``."""
        return all(char in string.punctuation for char in token)

    def _clean_sentence(self, sentence):
        """Clean a single sentence and return it as a space-joined string."""

        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)

        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)

        # Remove punctuation. The check is per character: a substring test
        # against string.punctuation would keep tokens such as "..." or "?!".
        if self.remove_punct:
            words = [word for word in words if not self._is_punctuation(word)]

        # Stem words
        if self.stemmer:
            words = [self.stemmer.stem(word) for word in words]

        # Join list elements into string
        return " ".join(words)

    def fit(self, *_):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

In [None]:
# Build the pieces the cleaner needs: a stemmer, a tokenizer and the regex substitutions
stemmer = SnowballStemmer("english", ignore_stopwords=True)
tokenizer = WordPunctTokenizer()
# Each entry is a (pattern, replacement) pair; this one deletes anything between "<" and ">"
regex_list = [("<[^>]*>", "")]

# Clean every entry of the training text column
cleaner = TextCleanerTransformer(
    tokenizer=tokenizer,
    stemmer=stemmer,
    regex_list=regex_list,
)
docs = cleaner.transform(train_df.text.values)

In [None]:
#BLU08 Exercise

In [None]:
def train_model_pca_svm(X_train, y_train, X_test, y_test, num_features=100, seed=42,
                        max_vocab_size=5000):
    """Vectorize text with word counts, reduce it with PCA and classify it with an SVC.

    Parameters:
        X_train (Series): Text data for training
        y_train (Series): Labels corresponding to X_train
        X_test (Series): Text data for testing
        y_test (Series): Labels corresponding to X_test (not used by this
            function; kept so existing callers keep working)
        num_features (int): Number of PCA components to keep
        seed (int): Random state passed to the PCA
        max_vocab_size (int): ``max_features`` passed to the CountVectorizer

    Returns:
        vectorizer (CountVectorizer): CountVectorizer, fitted to X_train
        pca (PCA): PCA with num_features components, fitted to the dense training counts
        clf (SVC): SVC fitted to the PCA-projected training data
        y_pred: Predictions of clf for the PCA-projected X_test, as returned by ``clf.predict``
        explained_variance (float): Variance of the projected training data
            divided by the variance of the training counts (both summed over columns)
    """
    # The vectorizer is fitted on the training text only and reused for the test text
    vectorizer = CountVectorizer(max_features=max_vocab_size)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # toarray() turns the vectorizer output into dense arrays before PCA
    dense_X_train = X_train_vec.toarray()
    dense_X_test = X_test_vec.toarray()
    # Total variance of the training counts, summed over all features
    data_var = np.var(dense_X_train, axis=0).sum()

    pca = PCA(n_components=num_features, random_state=seed)
    pca.fit(dense_X_train)
    X_train_pca = pca.transform(dense_X_train)
    X_test_pca = pca.transform(dense_X_test)

    clf = SVC()
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)

    # Share of the original variance that survives the projection
    explained_variance = np.var(X_train_pca, axis=0).sum() / data_var

    return vectorizer, pca, clf, y_pred, explained_variance