In [31]:
from __future__ import print_function

import logging
from pprint import pprint
from time import time

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Read in datasets

In [6]:
# Load the published project listing from its pickle file.
# Pickles can execute arbitrary code on load; only read files from a trusted source.
project_listing_path = '../Data/EWS_Published Project_Listing_DD.pkl'
project_data = pd.read_pickle(project_listing_path)

In [7]:
# Load the processed, cleaned Feedly article table from its pickle file.
# Pickles can execute arbitrary code on load; only read files from a trusted source.
feedly_articles_path = '../Data/Feedly_Processed_DF_cleaned.pkl'
data = pd.read_pickle(feedly_articles_path)

In [11]:
# Load the sector labels; this table is joined to the articles on `article_id`.
sector_labels_path = '../Data/Labeled_Data/sectors.csv'
train_data = pd.read_csv(sector_labels_path)

In [16]:
# Attach the sector label columns to each article. This is an inner join on
# `article_id`, so only articles present in both tables are kept.
sector_merge_columns = ['article_id', 'Sectors', 'cl_Sector', 'top_sector']
data_with_sectors = data.merge(
    right=train_data[sector_merge_columns],
    how='inner',
    on='article_id',
)

# Process training data

In [18]:
# Keep only articles that carry at least one sector label: a row is dropped
# only when all three label columns are missing.
sector_label_columns = ['Sectors', 'cl_Sector', 'top_sector']
labeled_data = data_with_sectors.dropna(subset=sector_label_columns, how='all')

In [19]:
# Sanity check: show (rows, columns) of the labeled subset left after dropping unlabeled rows.
labeled_data.shape

(296, 14)

In [21]:
# List the available columns: the article fields plus the merged sector label columns.
labeled_data.columns

Index(['article_id', 'title', 'url', 'feed_label', 'content', 'published',
       'summary', 'article_text', 'article_keywords', 'article_text_len',
       'top_lang', 'Sectors', 'cl_Sector', 'top_sector'],
      dtype='object')

In [25]:
# One-hot encode `cl_Sector` (one indicator column per distinct value) and
# attach the indicator columns to the labeled articles by aligning on the row index.
sector_dummies = pd.get_dummies(labeled_data['cl_Sector'])
labeled_data_horiz = labeled_data.merge(
    right=sector_dummies,
    left_index=True,
    right_index=True,
)

In [34]:
# Build a TF-IDF document-term matrix from the article text.
# `TfidfVectorizer` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# Fitting learns the vocabulary and IDF weights from these labeled articles;
# the result has one row per row of `labeled_data_horiz`.
# NOTE(review): fit_transform rejects NaN documents — confirm that
# 'article_text' has no missing values for the labeled rows.
vectorizer = TfidfVectorizer()
article_matrix = vectorizer.fit_transform(labeled_data_horiz['article_text'])

In [36]:
# Exploratory introspection: list every attribute of the matrix returned by fit_transform.
# NOTE(review): the output is very long; once exploration is done, consider removing this
# cell or inspecting `type(article_matrix)` / `article_matrix.shape` instead.
dir(article_matrix)

['__abs__',
 '__add__',
 '__array_priority__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__idiv__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmatmul__',
 '__rmul__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '_add_dense',
 '_add_sparse',
 '_arg_min_or_max',
 '_arg_min_or_max_axis',
 '_binopt',
 '_boolean_index_to_array',
 '_check_boolean',
 '_check_ellipsis',
 '_cs_matrix__get_has_canonical_format',
 '_cs_matrix__get_sorted',
 '_cs_matrix_

In [40]:
# Peek at the `indices` attribute of the sparse TF-IDF matrix.
# NOTE(review): for a SciPy CSR matrix these are the column indices of the stored entries,
# i.e. positions in the vectorizer's vocabulary — confirm the matrix format before relying on it.
article_matrix.indices

array([7360,  627, 2307, ..., 1958, 5196, 5292], dtype=int32)

['00',
 '000',
 '000megawatts',
 '01',
 '02',
 '020',
 '04',
 '042',
 '047',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '1005',
 '100bn',
 '100m',
 '100mw',
 '105',
 '106',
 '10km',
 '10m',
 '10th',
 '11',
 '110',
 '110km',
 '113',
 '114',
 '114mw',
 '116',
 '118',
 '118m',
 '119',
 '11bn',
 '11gw',
 '12',
 '120',
 '12000',
 '120km',
 '121',
 '125',
 '125bn',
 '125km',
 '126',
 '13',
 '130',
 '130km',
 '130m',
 '132',
 '135',
 '136km',
 '138m',
 '139',
 '14',
 '140',
 '140mn',
 '142',
 '143',
 '144',
 '145',
 '147',
 '148',
 '149',
 '14km',
 '15',
 '150',
 '1500',
 '151',
 '151kms',
 '153',
 '155',
 '158',
 '15km',
 '16',
 '160',
 '161',
 '163',
 '164',
 '165',
 '169',
 '17',
 '175',
 '18',
 '180',
 '180m',
 '183',
 '184',
 '188',
 '18m',
 '18th',
 '19',
 '190',
 '191',
 '1913',
 '192',
 '1925',
 '1930s',
 '194',
 '195',
 '1950',
 '195m',
 '1960',
 '1960s',
 '1966',
 '1972',
 '1973',
 '1974',
 '1975',
 '1979',
 '1980',
 '1982',
 '1986',
 '1989',
 '1990s',
 '1991',
 '1992',
 '19