Skip to content
Branch: master
Find file Copy path
Find file Copy path
1 contributor

Users who have contributed to this file

59 lines (46 sloc) 2.15 KB
import sklearn
import numpy as np
from glob import glob
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
def pathfinder(targetPath):
path_string = targetPath.replace('/path/to/project/folder/','')
path_string = path_string.strip('/')
return path_string
# Get paths to labelled data
rawFolderPaths = glob("/path/to/project/folder/*/")
print ('\nGathering labelled categories...\n')
categories = []
# Extract the folder paths, reduce down to the label and append to the categories list
for i in rawFolderPaths:
category = pathfinder(i)
# Load the data
print ('\nLoading the dataset...\n')
docs_to_train = sklearn.datasets.load_files("/Users/danielhoadley/PycharmProjects/trainer/!labelled_data_reportXML",
description=None, categories=categories, load_content=True,
encoding='utf-8', shuffle=True, random_state=42)
# Split the dataset into training and testing sets
print ('\nBuilding out hold-out test sample...\n')
X_train, X_test, y_train, y_test = train_test_split(,, test_size=0.4)
# Construct the classifier pipeline using a SGDClassifier algorithm
print ('\nApplying the classifier...\n')
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer(use_idf=True)),
('clf', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42, verbose=1)),
# Fit the model to the training data, y_train)
# Run the test data into the model
predicted = text_clf.predict(X_test)
# Calculate mean accuracy of predictions
print (np.mean(predicted == y_test))
# Generate labelled performance metrics
print(metrics.classification_report(y_test, predicted,
You can’t perform that action at this time.